From 8ccc6e2bbb1d910ea59e990bc1e4ac3219feb9d8 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Fri, 22 Mar 2024 16:58:43 -0600 Subject: [PATCH 001/243] Fixed GMRES bug in Constraint. --- .../function/constraint/ROL_ConstraintDef.hpp | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/packages/rol/src/function/constraint/ROL_ConstraintDef.hpp b/packages/rol/src/function/constraint/ROL_ConstraintDef.hpp index 66277f49ae5d..994f5db8579b 100644 --- a/packages/rol/src/function/constraint/ROL_ConstraintDef.hpp +++ b/packages/rol/src/function/constraint/ROL_ConstraintDef.hpp @@ -216,19 +216,19 @@ std::vector Constraint::solveAugmentedSystem(Vector &v1, v1.zero(); v2.zero(); // Allocate static memory. - ROL::Ptr > r1 = b1.clone(); - ROL::Ptr > r2 = b2.clone(); - ROL::Ptr > z1 = v1.clone(); - ROL::Ptr > z2 = v2.clone(); - ROL::Ptr > w1 = b1.clone(); - ROL::Ptr > w2 = b2.clone(); - std::vector > > V1; - std::vector > > V2; - ROL::Ptr > V2temp = b2.clone(); - std::vector > > Z1; - std::vector > > Z2; - ROL::Ptr > w1temp = b1.clone(); - ROL::Ptr > Z2temp = v2.clone(); + Ptr> r1 = b1.clone(); + Ptr> r2 = b2.clone(); + Ptr> z1 = v1.clone(); + Ptr> z2 = v2.clone(); + Ptr> w1 = b1.clone(); + Ptr> w2 = b2.clone(); + std::vector>> V1; + std::vector>> V2; + Ptr> V2temp = b2.clone(); + std::vector>> Z1; + std::vector>> Z2; + Ptr> w1temp = b1.clone(); + Ptr> Z2temp = v2.clone(); std::vector res(m+1, zero); LA::Matrix H(m+1,m); @@ -237,13 +237,14 @@ std::vector Constraint::solveAugmentedSystem(Vector &v1, LA::Vector s(m+1); LA::Vector y(m+1); LA::Vector cnorm(m); - ROL::LAPACK lapack; + LAPACK lapack; // Compute initial residual. 
- applyAdjointJacobian(*r1, v2, x, zerotol); - r1->scale(-one); r1->axpy(-one, v1.dual()); r1->plus(b1); - applyJacobian(*r2, v1, x, zerotol); - r2->scale(-one); r2->plus(b2); + //applyAdjointJacobian(*r1, v2, x, zerotol); + //r1->scale(-one); r1->axpy(-one, v1.dual()); r1->plus(b1); + //applyJacobian(*r2, v1, x, zerotol); + //r2->scale(-one); r2->plus(b2); + r1->set(b1); r2->set(b2); res[0] = std::sqrt(r1->dot(*r1) + r2->dot(*r2)); // Check if residual is identically zero. @@ -336,12 +337,13 @@ std::vector Constraint::solveAugmentedSystem(Vector &v1, if (res[i+1] <= tol) { // std::cout << " solved in " << i+1 << " iterations to " << res[i+1] << " (" << res[i+1]/res[0] << ")" << std::endl; // Update solution vector. - v1.plus(*z1); - v2.plus(*z2); + //v1.plus(*z1); + //v2.plus(*z2); break; } } // for (int i=0; i++; i Date: Wed, 3 Apr 2024 15:36:28 -0600 Subject: [PATCH 002/243] Type P incorporation into Solver & Problem --- packages/rol/src/algorithm/ROL_Problem.hpp | 21 +- .../rol/src/algorithm/ROL_Problem_Def.hpp | 76 +++++- packages/rol/src/algorithm/ROL_Solver.hpp | 2 + packages/rol/src/algorithm/ROL_Solver_Def.hpp | 9 +- .../TypeP/ROL_TypeP_AlgorithmFactory.hpp | 4 +- .../TypeP/ROL_TypeP_Algorithm_Def.hpp | 9 +- .../ROL_TypeP_InexactNewtonAlgorithm_Def.hpp | 4 +- .../function/objective/ROL_l1Objective.hpp | 3 +- .../sol/algorithm/ROL_StochasticProblem.hpp | 2 + packages/rol/src/zoo/ROL_Types.hpp | 6 +- .../rol/test/algorithm/TypeP/CMakeLists.txt | 8 + packages/rol/test/algorithm/TypeP/test_08.cpp | 255 ++++++++++++++++++ 12 files changed, 374 insertions(+), 25 deletions(-) create mode 100644 packages/rol/test/algorithm/TypeP/test_08.cpp diff --git a/packages/rol/src/algorithm/ROL_Problem.hpp b/packages/rol/src/algorithm/ROL_Problem.hpp index 2176077070a5..80a6fcc3b375 100644 --- a/packages/rol/src/algorithm/ROL_Problem.hpp +++ b/packages/rol/src/algorithm/ROL_Problem.hpp @@ -65,7 +65,8 @@ class Problem { bool hasInequality_; bool hasLinearEquality_; bool 
hasLinearInequality_; - unsigned cnt_econ_; + bool hasProximableObjective_; + unsigned cnt_econ_; unsigned cnt_icon_; unsigned cnt_linear_econ_; unsigned cnt_linear_icon_; @@ -73,6 +74,7 @@ class Problem { ParameterList ppa_list_; Ptr> obj_; + Ptr> nobj_; Ptr> xprim_; Ptr> xdual_; Ptr> bnd_; @@ -89,6 +91,7 @@ class Problem { protected: Ptr> INPUT_obj_; + Ptr> INPUT_nobj_; Ptr> INPUT_xprim_; Ptr> INPUT_xdual_; Ptr> INPUT_bnd_; @@ -119,12 +122,14 @@ class Problem { hasInequality_(problem.hasInequality_), hasLinearEquality_(problem.hasLinearEquality_), hasLinearInequality_(problem.hasLinearInequality_), + hasProximableObjective_(problem.hasProximableObjective_), cnt_econ_(problem.cnt_econ_), cnt_icon_(problem.cnt_icon_), cnt_linear_econ_(problem.cnt_linear_econ_), cnt_linear_icon_(problem.cnt_linear_icon_), ppa_list_(problem.ppa_list_), INPUT_obj_(problem.INPUT_obj_), + INPUT_nobj_(problem.INPUT_nobj_), INPUT_xprim_(problem.INPUT_xprim_), INPUT_xdual_(problem.INPUT_xdual_), INPUT_bnd_(problem.INPUT_bnd_), @@ -222,6 +227,14 @@ class Problem { @param[in] ppa polyhedral projection algorithm */ void setProjectionAlgorithm(ParameterList &parlist); + + /** Set Proximable objective function + */ + void addProximableObjective(const Ptr> &nobj); + /** Remove Proximable objective function + */ + void removeProximableObjective(); + /***************************************************************************/ /*** Accessor methods ******************************************************/ @@ -230,6 +243,10 @@ class Problem { /** \brief Get the objective function. */ const Ptr>& getObjective(); + + /** Get proximable objective + */ + const Ptr>& getProximableObjective(); /** \brief Get the primal optimization space vector. */ @@ -260,7 +277,7 @@ class Problem { */ const Ptr>& getPolyhedralProjection(); - /** \brief Get the optimization problem type (U, B, E, or G). + /** \brief Get the optimization problem type (U, B, E, G, or P). 
*/ EProblem getProblemType(); diff --git a/packages/rol/src/algorithm/ROL_Problem_Def.hpp b/packages/rol/src/algorithm/ROL_Problem_Def.hpp index 902f3b31379d..05e8b8c48b7b 100644 --- a/packages/rol/src/algorithm/ROL_Problem_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Problem_Def.hpp @@ -55,11 +55,13 @@ Problem::Problem( const Ptr> &obj, : isFinalized_(false), hasBounds_(false), hasEquality_(false), hasInequality_(false), hasLinearEquality_(false), hasLinearInequality_(false), + hasProximableObjective_(false), cnt_econ_(0), cnt_icon_(0), cnt_linear_econ_(0), cnt_linear_icon_(0), - obj_(nullPtr), xprim_(nullPtr), xdual_(nullPtr), bnd_(nullPtr), + obj_(nullPtr), nobj_(nullPtr), xprim_(nullPtr), xdual_(nullPtr), bnd_(nullPtr), con_(nullPtr), mul_(nullPtr), res_(nullPtr), proj_(nullPtr), problemType_(TYPE_U) { INPUT_obj_ = obj; + INPUT_nobj_ = nullPtr; INPUT_xprim_ = x; INPUT_bnd_ = nullPtr; INPUT_con_.clear(); @@ -86,6 +88,26 @@ void Problem::removeBoundConstraint() { hasBounds_ = false; } + +template +void Problem::addProximableObjective(const Ptr> &nobj) { + ROL_TEST_FOR_EXCEPTION(isFinalized_,std::invalid_argument, + ">>> ROL::Problem: Cannot add regularizer after problem is finalized!"); + + INPUT_nobj_ = nobj; + hasProximableObjective_ = true; +} + +template +void Problem::removeProximableObjective() { + ROL_TEST_FOR_EXCEPTION(isFinalized_,std::invalid_argument, + ">>> ROL::Problem: Cannot remove regularizer after problem is finalized!"); + + INPUT_nobj_ = nullPtr; + hasProximableObjective_ = false; +} + + template void Problem::addConstraint( std::string name, const Ptr> &econ, @@ -210,10 +232,11 @@ template void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostream &outStream) { if (!isFinalized_) { std::unordered_map> con, lcon, icon; - bool hasEquality = hasEquality_; - bool hasLinearEquality = hasLinearEquality_; - bool hasInequality = hasInequality_; - bool hasLinearInequality = hasLinearInequality_; + bool hasEquality = hasEquality_; + 
bool hasLinearEquality = hasLinearEquality_; + bool hasInequality = hasInequality_; + bool hasLinearInequality = hasLinearInequality_; + bool hasProximableObjective = hasProximableObjective_; con.insert(INPUT_con_.begin(),INPUT_con_.end()); if (lumpConstraints) { con.insert(INPUT_linear_con_.begin(),INPUT_linear_con_.end()); @@ -229,19 +252,21 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr //std::cout << hasBounds_ << " " << hasEquality << " " << hasInequality << " " << hasLinearEquality << " " << hasLinearInequality << std::endl; if (!hasLinearEquality && !hasLinearInequality) { proj_ = nullPtr; - if (!hasEquality && !hasInequality && !hasBounds_) { + if (!hasEquality && !hasInequality && !hasBounds_ && !hasProximableObjective) { problemType_ = TYPE_U; obj_ = INPUT_obj_; - xprim_ = INPUT_xprim_; + nobj_ = nullPtr; + xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; con_ = nullPtr; mul_ = nullPtr; res_ = nullPtr; } - else if (!hasEquality && !hasInequality && hasBounds_) { + else if (!hasEquality && !hasInequality && hasBounds_ && !hasProximableObjective) { problemType_ = TYPE_B; obj_ = INPUT_obj_; + nobj_ = nullPtr; xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = INPUT_bnd_; @@ -249,10 +274,11 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr mul_ = nullPtr; res_ = nullPtr; } - else if (hasEquality && !hasInequality && !hasBounds_) { + else if (hasEquality && !hasInequality && !hasBounds_ && !hasProximableObjective) { ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_); problemType_ = TYPE_E; obj_ = INPUT_obj_; + nobj_ = nullPtr; xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; @@ -260,10 +286,27 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr mul_ = cm.getMultiplier(); res_ = cm.getResidual(); } + else if (hasProximableObjective){ + if (!hasEquality && !hasInequality && !hasBounds_){ + problemType_ = TYPE_P; + obj_ = INPUT_obj_; + nobj_ = 
INPUT_nobj_; + xprim_ = INPUT_xprim_; + xdual_ = INPUT_xdual_; + bnd_ = nullPtr; + con_ = nullPtr; + mul_ = nullPtr; + res_ = nullPtr; + } + else { + throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); + } + } else { ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); problemType_ = TYPE_EB; obj_ = INPUT_obj_; + nobj_ = nullPtr; if (cm.hasInequality()) { obj_ = makePtr>(INPUT_obj_); } @@ -277,6 +320,10 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr } else { if (!hasBounds_ && !hasLinearInequality) { + if (hasProximableObjective){ + throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); + } + nobj_ = nullPtr; ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_); xfeas_ = cm.getOptVector()->clone(); xfeas_->set(*cm.getOptVector()); rlc_ = makePtr>(cm.getConstraint(),xfeas_,cm.getResidual()); @@ -355,6 +402,7 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr outStream << std::endl; outStream << " ROL::Problem::finalize" << std::endl; outStream << " Problem Summary:" << std::endl; + outStream << " Has Proximable Objective? .......... " << (hasProximableObjective ? "yes" : "no") << std::endl; outStream << " Has Bound Constraint? .............. " << (hasBounds_ ? "yes" : "no") << std::endl; outStream << " Has Equality Constraint? ........... " << (hasEquality ? 
"yes" : "no") << std::endl; if (hasEquality) { @@ -444,7 +492,11 @@ const Ptr>& Problem::getObjective() { finalize(); return obj_; } - +template +const Ptr>& Problem::getProximableObjective(){ + finalize(); + return nobj_; +} template const Ptr>& Problem::getPrimalOptimizationVector() { finalize(); @@ -616,7 +668,7 @@ void Problem::checkDerivatives(bool printToStream, std::ostream &outStream INPUT_obj_->checkGradient(*x,*g,*d,printToStream,outStream); INPUT_obj_->checkHessVec(*x,*g,*d,printToStream,outStream); INPUT_obj_->checkHessSym(*x,*g,*d,*v,printToStream,outStream); - + //TODO: Proximable Objective Check // Constraint check for (auto it = INPUT_con_.begin(); it != INPUT_con_.end(); ++it) { c = it->second.residual->clone(); c->randomize(-scale,scale); @@ -646,6 +698,8 @@ void Problem::check(bool printToStream, std::ostream &outStream, const Ptr if (hasLinearEquality_ || hasLinearInequality_) checkLinearity(printToStream,outStream); checkDerivatives(printToStream,outStream,x0,scale); +// if (hasProximableObjective) +// checkProximableObjective(printToStream, outStream); } template diff --git a/packages/rol/src/algorithm/ROL_Solver.hpp b/packages/rol/src/algorithm/ROL_Solver.hpp index 0947d6f6fc50..c91b2e79ce91 100644 --- a/packages/rol/src/algorithm/ROL_Solver.hpp +++ b/packages/rol/src/algorithm/ROL_Solver.hpp @@ -48,6 +48,7 @@ #include "ROL_TypeB_AlgorithmFactory.hpp" #include "ROL_TypeE_AlgorithmFactory.hpp" #include "ROL_TypeG_AlgorithmFactory.hpp" +#include "ROL_TypeP_AlgorithmFactory.hpp" #include "ROL_Problem.hpp" #include "ROL_ParameterList.hpp" @@ -71,6 +72,7 @@ class Solver { Ptr> algoB_; Ptr> algoE_; Ptr> algoG_; + Ptr> algoP_; public: diff --git a/packages/rol/src/algorithm/ROL_Solver_Def.hpp b/packages/rol/src/algorithm/ROL_Solver_Def.hpp index b6b7c9cd15be..e74f28ad4557 100644 --- a/packages/rol/src/algorithm/ROL_Solver_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Solver_Def.hpp @@ -53,6 +53,7 @@ Solver::Solver( const Ptr> &opt, : opt_(opt), 
problemType_(opt_->getProblemType()) { switch (problemType_) { case TYPE_U: algoU_ = TypeU::AlgorithmFactory(parlist,secant); break; + case TYPE_P: algoP_ = TypeP::AlgorithmFactory(parlist,secant); break; case TYPE_B: algoB_ = TypeB::AlgorithmFactory(parlist,secant); break; case TYPE_E: algoE_ = TypeE::AlgorithmFactory(parlist,secant); break; case TYPE_EB: algoG_ = TypeG::AlgorithmFactory(parlist,secant); break; @@ -78,6 +79,10 @@ int Solver::solve( std::ostream &outStream, if (status != nullPtr) algoU_->setStatusTest(status,combineStatus); algoU_->run(*opt_,outStream); break; + case TYPE_P: + if (status != nullPtr) algoP_->setStatusTest(status,combineStatus); + algoP_->run(*opt_,outStream); + break; case TYPE_B: if (status != nullPtr) algoB_->setStatusTest(status,combineStatus); algoB_->run(*opt_,outStream); @@ -106,7 +111,8 @@ Ptr> Solver::getAlgorithmState() const { //Ptr>& Solver::getAlgorithmState() const { switch (problemType_) { case TYPE_U: return algoU_->getState(); - case TYPE_B: return algoB_->getState(); + case TYPE_P: return algoP_->getState(); + case TYPE_B: return algoB_->getState(); case TYPE_E: return algoE_->getState(); case TYPE_EB: return algoG_->getState(); case TYPE_LAST: @@ -120,6 +126,7 @@ template void Solver::reset() { switch (problemType_) { case TYPE_U: algoU_->reset(); break; + case TYPE_P: algoP_->reset(); break; case TYPE_B: algoB_->reset(); break; case TYPE_E: algoE_->reset(); break; case TYPE_EB: algoG_->reset(); break; diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp index 4d46667ba333..23b89e87c61e 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp @@ -129,7 +129,7 @@ inline EAlgorithmP StringToEAlgorithmP(std::string s) { } template -inline Ptr> AlgorithmFactory(ParameterList &parlist) { +inline Ptr> AlgorithmFactory(ParameterList &parlist, 
const Ptr> &secant = nullPtr) { EAlgorithmP ealg = StringToEAlgorithmP(parlist.sublist("Step").get("Type","Trust Region")); switch(ealg) { case ALGORITHM_P_LINESEARCH: @@ -138,7 +138,7 @@ inline Ptr> AlgorithmFactory(ParameterList &parlist) { = parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Newton-Krylov"); if (desc=="Newton-Krylov" || desc=="Newton") return makePtr>(parlist); - else if (desc=="Quasi-Newton Method" || desc = "Quasi-Newton") + else if (desc=="Quasi-Newton Method" || desc == "Quasi-Newton") return makePtr>(parlist); else return makePtr>(parlist); diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp index 33c1f4e0fd79..07ade19857b4 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp @@ -103,17 +103,16 @@ void Algorithm::setStatusTest(const Ptr> &status, template void Algorithm::run( Problem &problem, std::ostream &outStream ) { - /*if (problem.getProblemType() == TYPE_P) { + if (problem.getProblemType() == TYPE_P) { run(*problem.getPrimalOptimizationVector(), - *problem.getDualOptimizationVector(), *problem.getObjective(), + *problem.getProximableObjective(), outStream); problem.finalizeIteration(); } else { - throw Exception::NotImplemented(">>> ROL::TypeP::Algorithm::run : Optimization problem is not Type P!"); - }*/ - throw Exception::NotImplemented(">>> ROL::TypeP::Algorithm::run : Optimization problem is not available for Type P problems!"); + throw Exception::NotImplemented(">>> ROL::TypeP::Algorithm::run : Optimization problem is not Type P problems!"); + } } template diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp index 4b54395b2eab..f891fb208303 100644 --- 
a/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp @@ -41,8 +41,8 @@ // ************************************************************************ // @HEADER -#ifndef ROL_TYPEP_QUASINEWTONALGORITHM_DEF_HPP -#define ROL_TYPEP_QUASINEWTONALGORITHM_DEF_HPP +#ifndef ROL_TYPEP_INEXACTNEWTONALGORITHM_DEF_HPP +#define ROL_TYPEP_INEXACTNEWTONALGORITHM_DEF_HPP #include "ROL_TypeP_ProxGradientAlgorithm.hpp" #include "ROL_TypeP_SpectralGradientAlgorithm.hpp" diff --git a/packages/rol/src/function/objective/ROL_l1Objective.hpp b/packages/rol/src/function/objective/ROL_l1Objective.hpp index 4ca6eace5e19..21e0a0c3972d 100644 --- a/packages/rol/src/function/objective/ROL_l1Objective.hpp +++ b/packages/rol/src/function/objective/ROL_l1Objective.hpp @@ -105,7 +105,8 @@ class l1Objective : public Objective { Pv.applyBinary(psb_, *weights_); Pv.scale(t); Pv.plus(v); - } + } +//TODO: input prox jacobian }; // class l1Objective } // namespace ROL diff --git a/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp b/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp index 854c0f03f460..e2300c816034 100644 --- a/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp +++ b/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp @@ -67,6 +67,7 @@ template class StochasticProblem : public Problem { private: Ptr> ORIGINAL_obj_; + Ptr> ORIGINAL_nobj_; Ptr> ORIGINAL_xprim_; Ptr> ORIGINAL_xdual_; Ptr> ORIGINAL_bnd_; @@ -80,6 +81,7 @@ class StochasticProblem : public Problem { std::unordered_map statMap_; using Problem::INPUT_obj_; + using Problem::INPUT_nobj_; using Problem::INPUT_xprim_; using Problem::INPUT_xdual_; using Problem::INPUT_bnd_; diff --git a/packages/rol/src/zoo/ROL_Types.hpp b/packages/rol/src/zoo/ROL_Types.hpp index 739eda150c76..f2df9e7968b1 100644 --- a/packages/rol/src/zoo/ROL_Types.hpp +++ b/packages/rol/src/zoo/ROL_Types.hpp @@ -256,6 +256,7 @@ namespace ROL { 
// Types of optimization problem enum EProblem { TYPE_U = 0, + TYPE_P, TYPE_B, TYPE_E, TYPE_EB, @@ -312,7 +313,9 @@ namespace ROL { (s == STEP_TRUSTREGION) || (s == STEP_BUNDLE) ); break; - + case TYPE_P: comp = ( (s == STEP_LINESEARCH) || + (s == STEP_TRUSTREGION)); + break; case TYPE_B: comp = ( (s == STEP_LINESEARCH) || (s == STEP_TRUSTREGION) || (s == STEP_MOREAUYOSIDAPENALTY) || @@ -341,6 +344,7 @@ namespace ROL { std::string retString; switch(p) { case TYPE_U: retString = "Type-U"; break; + case TYPE_P: retString = "Type-P"; break; case TYPE_E: retString = "Type-E"; break; case TYPE_B: retString = "Type-B"; break; case TYPE_EB: retString = "Type-EB"; break; diff --git a/packages/rol/test/algorithm/TypeP/CMakeLists.txt b/packages/rol/test/algorithm/TypeP/CMakeLists.txt index c0a93e5727ac..5dff144817c5 100644 --- a/packages/rol/test/algorithm/TypeP/CMakeLists.txt +++ b/packages/rol/test/algorithm/TypeP/CMakeLists.txt @@ -62,6 +62,14 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( ADD_DIR_TO_NAME ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + TestTypePSolver + SOURCES test_08.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) TRIBITS_COPY_FILES_TO_BINARY_DIR( TypePTestDataCopy SOURCE_FILES diff --git a/packages/rol/test/algorithm/TypeP/test_08.cpp b/packages/rol/test/algorithm/TypeP/test_08.cpp new file mode 100644 index 000000000000..da950c89083a --- /dev/null +++ b/packages/rol/test/algorithm/TypeP/test_08.cpp @@ -0,0 +1,255 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + +/*! \file test_03.cpp + \brief Validate Trust Region algorithm. 
+*/ + +#include "ROL_TypeP_TrustRegionAlgorithm.hpp" +#include "ROL_StdObjective.hpp" +#include "ROL_l1Objective.hpp" +#include "ROL_Solver.hpp" +#include "ROL_Problem.hpp" +#include "ROL_Stream.hpp" +#include "Teuchos_GlobalMPISession.hpp" +#include +#include + +template +class QuadraticTypeP_Test01 : public ROL::StdObjective { +private: + int dim_; + std::vector a_, b_; + +public: + QuadraticTypeP_Test01(int dim) : dim_(dim) { + using seed_type = std::mt19937_64::result_type; + seed_type const seed = 123; + std::mt19937_64 eng{seed}; + std::uniform_real_distribution distA(0.0,5.0), distB(-10.0,10.0); + a_.resize(dim); + b_.resize(dim); + for (int i = 0; i < dim; ++i) { + a_[i] = distA(eng); + b_[i] = distB(eng); + } + } + + Real value(const std::vector &x, Real &tol) { + Real val(0); + for (int i = 0; i < dim_; ++i) + val += static_cast(0.5)*a_[i]*x[i]*x[i] + b_[i]*x[i]; + return val; + } + + void gradient(std::vector &g, const std::vector &x, Real &tol) { + for (int i = 0; i < dim_; ++i) + g[i] = a_[i]*x[i] + b_[i]; + } + + void hessVec(std::vector &hv, const std::vector &v, const std::vector &x, Real &tol) { + for (int i = 0; i < dim_; ++i) + hv[i] = a_[i]*v[i]; + } + + void getSolution(std::vector &x, const std::vector &wts, const std::vector &y) const { + for (int i = 0; i < dim_; ++i) + x[i] = (std::min(wts[i], std::max(-wts[i], a_[i]*y[i] + b_[i])) - b_[i]) / a_[i]; + } +}; + +typedef double RealT; + +int main(int argc, char *argv[]) { + + Teuchos::GlobalMPISession mpiSession(&argc, &argv); + + // This little trick lets us print to std::cout only if a + // (dummy) command-line argument is provided. 
+ int iprint = argc - 1; + ROL::Ptr outStream; + ROL::nullstream bhs; // outputs nothing + if (iprint > 0) + outStream = ROL::makePtrFromRef(std::cout); + else + outStream = ROL::makePtrFromRef(bhs); + + int errorFlag = 0; + + try { + RealT tol = 1e2*std::sqrt(ROL::ROL_EPSILON()); + + ROL::ParameterList list; + list.sublist("General").set("Output Level",iprint); + list.sublist("Step").set("Type","Trust Region"); + list.sublist("Status Test").set("Gradient Tolerance",1e-1*tol); + list.sublist("Status Test").set("Constraint Tolerance",1e-1*tol); + list.sublist("Status Test").set("Step Tolerance",1e-3*tol); + list.sublist("Status Test").set("Iteration Limit", 50); + int dim = 5; + ROL::Ptr> sol, wts, y; + ROL::Ptr> sobj; + ROL::Ptr> nobj; + + + ROL::Ptr> algo; + std::vector data; + RealT err(0); + + *outStream << std::endl << "Random Diagonal LASSO Test Problem" << std::endl << std::endl; + ROL::Ptr> wtsP = ROL::makePtr>(dim); + ROL::Ptr> yP = ROL::makePtr>(dim); + wts = ROL::makePtr>(wtsP);// wts->setSeed(234); + y = ROL::makePtr>(yP); // y->setSeed(345); + sol = ROL::makePtr>(dim); + wts->randomize(static_cast(0),static_cast(1)); + y->randomize(static_cast(-5),static_cast(5)); + + nobj = ROL::makePtr>(wts,y); + sobj = ROL::makePtr>(dim); + + std::vector xstar(dim); + sobj->getSolution(xstar, *wtsP, *yP); + RealT xmax(0); + for (int i = 0; i < dim; ++i) + xmax = std::max(xmax,std::abs(xstar[i])); + + // Check derivatives of smooth function + ROL::Ptr> xd = sol->clone(); + xd->randomize(-1.0,1.0); + ROL::Ptr> yd = sol->clone(); + yd->randomize(-1.0,1.0); + ROL::Ptr> zd = sol->clone(); + zd->randomize(-1.0,1.0); + sobj->checkGradient(*xd,*yd,true,*outStream); + sobj->checkHessVec(*xd,*yd,true,*outStream); + sobj->checkHessSym(*xd,*yd,*zd,true,*outStream); + + list.sublist("Step").sublist("Trust Region").sublist("TRN").sublist("Solver").set("Subproblem Solver", "SPG"); + sol->zero(); + auto problem = ROL::makePtr>(sobj, sol); // check + 
problem->addProximableObjective(nobj); + problem->finalize(false, true, *outStream); + ROL::Solver solverspg(problem, list); + + auto begin = std::chrono::high_resolution_clock::now(); + solverspg.solve(*outStream); + auto end = std::chrono::high_resolution_clock::now(); + *outStream << " Optimization Time: " << std::chrono::duration_cast(end-begin).count() << " microseconds" << std::endl; + + err = static_cast(0); + data = *ROL::staticPtrCast>(sol)->getVector(); + *outStream << " Result: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << data[i]; + err = std::max(err,std::abs(data[i]-xstar[i])); + } + *outStream << std::endl; + *outStream << " Truth: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << xstar[i]; + } + *outStream << std::endl; + *outStream << " Max Relative Error = " << err/xmax << std::endl; + errorFlag += (err > tol ? 1 : 0); + + list.sublist("Step").sublist("Trust Region").sublist("TRN").sublist("Solver").set("Subproblem Solver", "Simplified SPG"); + sol->zero(); + ROL::Solver solverspg2(problem, list); + begin = std::chrono::high_resolution_clock::now(); + solverspg2.solve(*outStream); + end = std::chrono::high_resolution_clock::now(); + *outStream << " Optimization Time: " << std::chrono::duration_cast(end-begin).count() << " microseconds" << std::endl; + + err = static_cast(0); + data = *ROL::staticPtrCast>(sol)->getVector(); + *outStream << " Result: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << data[i]; + err = std::max(err,std::abs(data[i]-xstar[i])); + } + *outStream << std::endl; + *outStream << " Truth: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << xstar[i]; + } + *outStream << std::endl; + *outStream << " Max Relative Error = " << err/xmax << std::endl; + errorFlag += (err > tol ? 
1 : 0); + + list.sublist("Step").sublist("Trust Region").sublist("TRN").sublist("Solver").set("Subproblem Solver", "NCG"); + sol->zero(); + ROL::Solver solverncg(problem, list); + begin = std::chrono::high_resolution_clock::now(); + solverncg.solve(*outStream); + end = std::chrono::high_resolution_clock::now(); + *outStream << " Optimization Time: " << std::chrono::duration_cast(end-begin).count() << " microseconds" << std::endl; + + err = static_cast(0); + data = *ROL::staticPtrCast>(sol)->getVector(); + *outStream << " Result: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << data[i]; + err = std::max(err,std::abs(data[i]-xstar[i])); + } + *outStream << std::endl; + *outStream << " Truth: "; + for (int i = 0; i < dim; ++i) { + *outStream << " x" << i+1 << " = " << xstar[i]; + } + *outStream << std::endl; + *outStream << " Max Relative Error = " << err/xmax << std::endl; + errorFlag += (err > tol ? 1 : 0); + } + catch (std::logic_error& err) { + *outStream << err.what() << "\n"; + errorFlag = -1000; + }; // end try + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return 0; +} From 4de76e420fec5c02a155377f1df3dc8310ada365 Mon Sep 17 00:00:00 2001 From: Robert Baraldi Date: Wed, 3 Apr 2024 17:21:22 -0600 Subject: [PATCH 003/243] Tab -> to 2-space changes where found in code. 
--- packages/rol/src/algorithm/ROL_Problem.hpp | 12 ++++++------ packages/rol/src/algorithm/ROL_Problem_Def.hpp | 2 +- packages/rol/src/algorithm/ROL_Solver_Def.hpp | 2 +- .../rol/src/sol/algorithm/ROL_StochasticProblem.hpp | 2 +- packages/rol/test/algorithm/TypeP/test_08.cpp | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/rol/src/algorithm/ROL_Problem.hpp b/packages/rol/src/algorithm/ROL_Problem.hpp index 80a6fcc3b375..47593f570154 100644 --- a/packages/rol/src/algorithm/ROL_Problem.hpp +++ b/packages/rol/src/algorithm/ROL_Problem.hpp @@ -66,7 +66,7 @@ class Problem { bool hasLinearEquality_; bool hasLinearInequality_; bool hasProximableObjective_; - unsigned cnt_econ_; + unsigned cnt_econ_; unsigned cnt_icon_; unsigned cnt_linear_econ_; unsigned cnt_linear_icon_; @@ -74,7 +74,7 @@ class Problem { ParameterList ppa_list_; Ptr> obj_; - Ptr> nobj_; + Ptr> nobj_; Ptr> xprim_; Ptr> xdual_; Ptr> bnd_; @@ -91,7 +91,7 @@ class Problem { protected: Ptr> INPUT_obj_; - Ptr> INPUT_nobj_; + Ptr> INPUT_nobj_; Ptr> INPUT_xprim_; Ptr> INPUT_xdual_; Ptr> INPUT_bnd_; @@ -233,7 +233,7 @@ class Problem { void addProximableObjective(const Ptr> &nobj); /** Remove Proximable objective function */ - void removeProximableObjective(); + void removeProximableObjective(); /***************************************************************************/ @@ -244,8 +244,8 @@ class Problem { */ const Ptr>& getObjective(); - /** Get proximable objective - */ + /** Get proximable objective + */ const Ptr>& getProximableObjective(); /** \brief Get the primal optimization space vector. 
diff --git a/packages/rol/src/algorithm/ROL_Problem_Def.hpp b/packages/rol/src/algorithm/ROL_Problem_Def.hpp index 05e8b8c48b7b..c3ec526413b2 100644 --- a/packages/rol/src/algorithm/ROL_Problem_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Problem_Def.hpp @@ -256,7 +256,7 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr problemType_ = TYPE_U; obj_ = INPUT_obj_; nobj_ = nullPtr; - xprim_ = INPUT_xprim_; + xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; con_ = nullPtr; diff --git a/packages/rol/src/algorithm/ROL_Solver_Def.hpp b/packages/rol/src/algorithm/ROL_Solver_Def.hpp index e74f28ad4557..9e2443277f59 100644 --- a/packages/rol/src/algorithm/ROL_Solver_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Solver_Def.hpp @@ -112,7 +112,7 @@ Ptr> Solver::getAlgorithmState() const { switch (problemType_) { case TYPE_U: return algoU_->getState(); case TYPE_P: return algoP_->getState(); - case TYPE_B: return algoB_->getState(); + case TYPE_B: return algoB_->getState(); case TYPE_E: return algoE_->getState(); case TYPE_EB: return algoG_->getState(); case TYPE_LAST: diff --git a/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp b/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp index e2300c816034..522ff2c8760c 100644 --- a/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp +++ b/packages/rol/src/sol/algorithm/ROL_StochasticProblem.hpp @@ -81,7 +81,7 @@ class StochasticProblem : public Problem { std::unordered_map statMap_; using Problem::INPUT_obj_; - using Problem::INPUT_nobj_; + using Problem::INPUT_nobj_; using Problem::INPUT_xprim_; using Problem::INPUT_xdual_; using Problem::INPUT_bnd_; diff --git a/packages/rol/test/algorithm/TypeP/test_08.cpp b/packages/rol/test/algorithm/TypeP/test_08.cpp index da950c89083a..7d0223b2a539 100644 --- a/packages/rol/test/algorithm/TypeP/test_08.cpp +++ b/packages/rol/test/algorithm/TypeP/test_08.cpp @@ -168,9 +168,9 @@ int main(int argc, char *argv[]) { 
list.sublist("Step").sublist("Trust Region").sublist("TRN").sublist("Solver").set("Subproblem Solver", "SPG"); sol->zero(); auto problem = ROL::makePtr>(sobj, sol); // check - problem->addProximableObjective(nobj); - problem->finalize(false, true, *outStream); - ROL::Solver solverspg(problem, list); + problem->addProximableObjective(nobj); + problem->finalize(false, true, *outStream); + ROL::Solver solverspg(problem, list); auto begin = std::chrono::high_resolution_clock::now(); solverspg.solve(*outStream); From 05a530361c74d68b9d1cc6f9227b0f98969a3baa Mon Sep 17 00:00:00 2001 From: Robert Baraldi Date: Wed, 3 Apr 2024 17:23:12 -0600 Subject: [PATCH 004/243] One spacing/tab issue more in comments. --- packages/rol/src/algorithm/ROL_Problem.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/rol/src/algorithm/ROL_Problem.hpp b/packages/rol/src/algorithm/ROL_Problem.hpp index 47593f570154..540ebcd151f8 100644 --- a/packages/rol/src/algorithm/ROL_Problem.hpp +++ b/packages/rol/src/algorithm/ROL_Problem.hpp @@ -228,11 +228,11 @@ class Problem { */ void setProjectionAlgorithm(ParameterList &parlist); - /** Set Proximable objective function - */ + /** Set Proximable objective function + */ void addProximableObjective(const Ptr> &nobj); - /** Remove Proximable objective function - */ + /** Remove Proximable objective function + */ void removeProximableObjective(); From 6ee4e512c435b92fd3d7a78df5d82b8c70b21bdd Mon Sep 17 00:00:00 2001 From: Robert Baraldi Date: Thu, 4 Apr 2024 10:44:52 -0600 Subject: [PATCH 005/243] Apply suggestions from code review --- packages/rol/src/algorithm/ROL_Problem.hpp | 3 ++- packages/rol/src/algorithm/ROL_Problem_Def.hpp | 3 --- packages/rol/test/algorithm/TypeP/CMakeLists.txt | 1 + packages/rol/test/algorithm/TypeP/test_08.cpp | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/packages/rol/src/algorithm/ROL_Problem.hpp b/packages/rol/src/algorithm/ROL_Problem.hpp index 
540ebcd151f8..ca977f5e8c99 100644 --- a/packages/rol/src/algorithm/ROL_Problem.hpp +++ b/packages/rol/src/algorithm/ROL_Problem.hpp @@ -231,6 +231,7 @@ class Problem { /** Set Proximable objective function */ void addProximableObjective(const Ptr> &nobj); + /** Remove Proximable objective function */ void removeProximableObjective(); @@ -244,7 +245,7 @@ class Problem { */ const Ptr>& getObjective(); - /** Get proximable objective + /** \brief Get proximable objective */ const Ptr>& getProximableObjective(); diff --git a/packages/rol/src/algorithm/ROL_Problem_Def.hpp b/packages/rol/src/algorithm/ROL_Problem_Def.hpp index c3ec526413b2..ecd6630bc90a 100644 --- a/packages/rol/src/algorithm/ROL_Problem_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Problem_Def.hpp @@ -88,7 +88,6 @@ void Problem::removeBoundConstraint() { hasBounds_ = false; } - template void Problem::addProximableObjective(const Ptr> &nobj) { ROL_TEST_FOR_EXCEPTION(isFinalized_,std::invalid_argument, @@ -106,8 +105,6 @@ void Problem::removeProximableObjective() { INPUT_nobj_ = nullPtr; hasProximableObjective_ = false; } - - template void Problem::addConstraint( std::string name, const Ptr> &econ, diff --git a/packages/rol/test/algorithm/TypeP/CMakeLists.txt b/packages/rol/test/algorithm/TypeP/CMakeLists.txt index 5dff144817c5..8a8572722d0b 100644 --- a/packages/rol/test/algorithm/TypeP/CMakeLists.txt +++ b/packages/rol/test/algorithm/TypeP/CMakeLists.txt @@ -70,6 +70,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( PASS_REGULAR_EXPRESSION "TEST PASSED" ADD_DIR_TO_NAME ) + TRIBITS_COPY_FILES_TO_BINARY_DIR( TypePTestDataCopy SOURCE_FILES diff --git a/packages/rol/test/algorithm/TypeP/test_08.cpp b/packages/rol/test/algorithm/TypeP/test_08.cpp index 7d0223b2a539..9b459e7f8f08 100644 --- a/packages/rol/test/algorithm/TypeP/test_08.cpp +++ b/packages/rol/test/algorithm/TypeP/test_08.cpp @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) { ROL::Ptr> nobj; - ROL::Ptr> algo; + ROL::Ptr> algo; std::vector data; RealT 
err(0); From 18fa4aa062156e5b43c6ba09df589c853a3dd480 Mon Sep 17 00:00:00 2001 From: Robert John Baraldi Date: Wed, 10 Apr 2024 15:43:52 -0600 Subject: [PATCH 006/243] easier logic changes --- .../rol/src/algorithm/ROL_Problem_Def.hpp | 236 +++++++++--------- 1 file changed, 119 insertions(+), 117 deletions(-) diff --git a/packages/rol/src/algorithm/ROL_Problem_Def.hpp b/packages/rol/src/algorithm/ROL_Problem_Def.hpp index ecd6630bc90a..8f4a2fee83d2 100644 --- a/packages/rol/src/algorithm/ROL_Problem_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Problem_Def.hpp @@ -247,151 +247,153 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr } // Transform optimization problem //std::cout << hasBounds_ << " " << hasEquality << " " << hasInequality << " " << hasLinearEquality << " " << hasLinearInequality << std::endl; - if (!hasLinearEquality && !hasLinearInequality) { - proj_ = nullPtr; - if (!hasEquality && !hasInequality && !hasBounds_ && !hasProximableObjective) { - problemType_ = TYPE_U; + if (hasProximableObjective){ + if (!hasEquality && !hasInequality && !hasBounds_ && !hasLinearEquality && !hasLinearInequality){ + problemType_ = TYPE_P; obj_ = INPUT_obj_; - nobj_ = nullPtr; + nobj_ = INPUT_nobj_; xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; con_ = nullPtr; mul_ = nullPtr; res_ = nullPtr; - } - else if (!hasEquality && !hasInequality && hasBounds_ && !hasProximableObjective) { - problemType_ = TYPE_B; - obj_ = INPUT_obj_; - nobj_ = nullPtr; - xprim_ = INPUT_xprim_; - xdual_ = INPUT_xdual_; - bnd_ = INPUT_bnd_; - con_ = nullPtr; - mul_ = nullPtr; - res_ = nullPtr; - } - else if (hasEquality && !hasInequality && !hasBounds_ && !hasProximableObjective) { - ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_); - problemType_ = TYPE_E; - obj_ = INPUT_obj_; - nobj_ = nullPtr; - xprim_ = INPUT_xprim_; - xdual_ = INPUT_xdual_; - bnd_ = nullPtr; - con_ = cm.getConstraint(); - mul_ = cm.getMultiplier(); - res_ = cm.getResidual(); - } 
- else if (hasProximableObjective){ - if (!hasEquality && !hasInequality && !hasBounds_){ - problemType_ = TYPE_P; + } + else { + throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); + } + } + else { + if (!hasLinearEquality && !hasLinearInequality) { + proj_ = nullPtr; + if (!hasEquality && !hasInequality && !hasBounds_ ) { + problemType_ = TYPE_U; obj_ = INPUT_obj_; - nobj_ = INPUT_nobj_; - xprim_ = INPUT_xprim_; + nobj_ = nullPtr; + xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; con_ = nullPtr; mul_ = nullPtr; res_ = nullPtr; } - else { - throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); - } - } - else { - ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); - problemType_ = TYPE_EB; - obj_ = INPUT_obj_; - nobj_ = nullPtr; - if (cm.hasInequality()) { - obj_ = makePtr>(INPUT_obj_); - } - xprim_ = cm.getOptVector(); - xdual_ = cm.getDualOptVector(); - bnd_ = cm.getBoundConstraint(); - con_ = cm.getConstraint(); - mul_ = cm.getMultiplier(); - res_ = cm.getResidual(); - } - } - else { - if (!hasBounds_ && !hasLinearInequality) { - if (hasProximableObjective){ - throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); - } - nobj_ = nullPtr; - ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_); - xfeas_ = cm.getOptVector()->clone(); xfeas_->set(*cm.getOptVector()); - rlc_ = makePtr>(cm.getConstraint(),xfeas_,cm.getResidual()); - proj_ = nullPtr; - if (!hasEquality && !hasInequality) { - problemType_ = TYPE_U; - obj_ = rlc_->transform(INPUT_obj_); - xprim_ = xfeas_->clone(); xprim_->zero(); - xdual_ = cm.getDualOptVector(); - bnd_ = nullPtr; + else if (!hasEquality && !hasInequality && hasBounds_) { + problemType_ = TYPE_B; + obj_ = INPUT_obj_; + nobj_ = nullPtr; + xprim_ = INPUT_xprim_; + xdual_ = INPUT_xdual_; + bnd_ = INPUT_bnd_; con_ = nullPtr; mul_ = nullPtr; res_ = nullPtr; } + else if (hasEquality && !hasInequality && !hasBounds_) 
{ + ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_); + problemType_ = TYPE_E; + obj_ = INPUT_obj_; + nobj_ = nullPtr; + xprim_ = INPUT_xprim_; + xdual_ = INPUT_xdual_; + bnd_ = nullPtr; + con_ = cm.getConstraint(); + mul_ = cm.getMultiplier(); + res_ = cm.getResidual(); + } else { - for (auto it = con.begin(); it != con.end(); ++it) { - icon.insert(std::pair>(it->first, - ConstraintData(rlc_->transform(it->second.constraint), - it->second.multiplier,it->second.residual,it->second.bounds))); + ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); + problemType_ = TYPE_EB; + obj_ = INPUT_obj_; + nobj_ = nullPtr; + if (cm.hasInequality()) { + obj_ = makePtr>(INPUT_obj_); } - Ptr> xtmp = xfeas_->clone(); xtmp->zero(); - ConstraintAssembler cm1(icon,xtmp,cm.getDualOptVector()); - xprim_ = cm1.getOptVector(); - xdual_ = cm1.getDualOptVector(); - con_ = cm1.getConstraint(); - mul_ = cm1.getMultiplier(); - res_ = cm1.getResidual(); - if (!hasInequality) { - problemType_ = TYPE_E; + xprim_ = cm.getOptVector(); + xdual_ = cm.getDualOptVector(); + bnd_ = cm.getBoundConstraint(); + con_ = cm.getConstraint(); + mul_ = cm.getMultiplier(); + res_ = cm.getResidual(); + } + } + else { + if (!hasBounds_ && !hasLinearInequality) { + if (hasProximableObjective){ + throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); + } + nobj_ = nullPtr; + ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_); + xfeas_ = cm.getOptVector()->clone(); xfeas_->set(*cm.getOptVector()); + rlc_ = makePtr>(cm.getConstraint(),xfeas_,cm.getResidual()); + proj_ = nullPtr; + if (!hasEquality && !hasInequality) { + problemType_ = TYPE_U; obj_ = rlc_->transform(INPUT_obj_); + xprim_ = xfeas_->clone(); xprim_->zero(); + xdual_ = cm.getDualOptVector(); bnd_ = nullPtr; + con_ = nullPtr; + mul_ = nullPtr; + res_ = nullPtr; } else { - problemType_ = TYPE_EB; - obj_ = makePtr>(rlc_->transform(INPUT_obj_)); - bnd_ = cm1.getBoundConstraint(); + for (auto it = 
con.begin(); it != con.end(); ++it) { + icon.insert(std::pair>(it->first, + ConstraintData(rlc_->transform(it->second.constraint), + it->second.multiplier,it->second.residual,it->second.bounds))); + } + Ptr> xtmp = xfeas_->clone(); xtmp->zero(); + ConstraintAssembler cm1(icon,xtmp,cm.getDualOptVector()); + xprim_ = cm1.getOptVector(); + xdual_ = cm1.getDualOptVector(); + con_ = cm1.getConstraint(); + mul_ = cm1.getMultiplier(); + res_ = cm1.getResidual(); + if (!hasInequality) { + problemType_ = TYPE_E; + obj_ = rlc_->transform(INPUT_obj_); + bnd_ = nullPtr; + } + else { + problemType_ = TYPE_EB; + obj_ = makePtr>(rlc_->transform(INPUT_obj_)); + bnd_ = cm1.getBoundConstraint(); + } } } - } - else if ((hasBounds_ || hasLinearInequality) && !hasEquality && !hasInequality) { - ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); - problemType_ = TYPE_B; - obj_ = INPUT_obj_; - if (cm.hasInequality()) { - obj_ = makePtr>(INPUT_obj_); + else if ((hasBounds_ || hasLinearInequality) && !hasEquality && !hasInequality) { + ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); + problemType_ = TYPE_B; + obj_ = INPUT_obj_; + if (cm.hasInequality()) { + obj_ = makePtr>(INPUT_obj_); + } + xprim_ = cm.getOptVector(); + xdual_ = cm.getDualOptVector(); + bnd_ = cm.getBoundConstraint(); + con_ = nullPtr; + mul_ = nullPtr; + res_ = nullPtr; + proj_ = PolyhedralProjectionFactory(*xprim_,*xdual_,bnd_, + cm.getConstraint(),*cm.getMultiplier(),*cm.getResidual(),ppa_list_); } - xprim_ = cm.getOptVector(); - xdual_ = cm.getDualOptVector(); - bnd_ = cm.getBoundConstraint(); - con_ = nullPtr; - mul_ = nullPtr; - res_ = nullPtr; - proj_ = PolyhedralProjectionFactory(*xprim_,*xdual_,bnd_, - cm.getConstraint(),*cm.getMultiplier(),*cm.getResidual(),ppa_list_); - } - else { - ConstraintAssembler cm(con,lcon,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); - problemType_ = TYPE_EB; - obj_ = INPUT_obj_; - if (cm.hasInequality()) { - obj_ = makePtr>(INPUT_obj_); + else { + 
ConstraintAssembler cm(con,lcon,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); + problemType_ = TYPE_EB; + obj_ = INPUT_obj_; + if (cm.hasInequality()) { + obj_ = makePtr>(INPUT_obj_); + } + xprim_ = cm.getOptVector(); + xdual_ = cm.getDualOptVector(); + con_ = cm.getConstraint(); + mul_ = cm.getMultiplier(); + res_ = cm.getResidual(); + bnd_ = cm.getBoundConstraint(); + proj_ = PolyhedralProjectionFactory(*xprim_,*xdual_,bnd_, + cm.getLinearConstraint(),*cm.getLinearMultiplier(), + *cm.getLinearResidual(),ppa_list_); } - xprim_ = cm.getOptVector(); - xdual_ = cm.getDualOptVector(); - con_ = cm.getConstraint(); - mul_ = cm.getMultiplier(); - res_ = cm.getResidual(); - bnd_ = cm.getBoundConstraint(); - proj_ = PolyhedralProjectionFactory(*xprim_,*xdual_,bnd_, - cm.getLinearConstraint(),*cm.getLinearMultiplier(), - *cm.getLinearResidual(),ppa_list_); } } isFinalized_ = true; From 5214dbd538c7bb7143d67ce43dbc674c656c2e75 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Fri, 19 Apr 2024 11:18:47 -0600 Subject: [PATCH 007/243] added a .gitignore to main rol directory to ignore binary files --- packages/rol/.gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 packages/rol/.gitignore diff --git a/packages/rol/.gitignore b/packages/rol/.gitignore new file mode 100644 index 000000000000..e10e419d2535 --- /dev/null +++ b/packages/rol/.gitignore @@ -0,0 +1,10 @@ +*.zip +*.tar +*.tar.gz +*.bz2 +*.xz +*.swo +*.swp +*.pyc +__pycache__ + From fb9be66807f97e6e5ff737daab1ea621a2adb27d Mon Sep 17 00:00:00 2001 From: Robert John Baraldi Date: Thu, 25 Apr 2024 18:08:15 -0600 Subject: [PATCH 008/243] Drew change request - secant input where applicable --- .../rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp index 23b89e87c61e..ca780ad30804 100644 --- 
a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp @@ -139,11 +139,11 @@ inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr>(parlist); else if (desc=="Quasi-Newton Method" || desc == "Quasi-Newton") - return makePtr>(parlist); + return makePtr>(parlist, secant); else return makePtr>(parlist); } - case ALGORITHM_P_TRUSTREGION: return makePtr>(parlist); + case ALGORITHM_P_TRUSTREGION: return makePtr>(parlist, secant); case ALGORITHM_P_SPECTRALGRADIENT: return makePtr>(parlist); case ALGORITHM_P_IPIANO: return makePtr>(parlist); default: return nullPtr; From 0cc1e266f8712da901b52c25a6107d439dec6a42 Mon Sep 17 00:00:00 2001 From: Robert John Baraldi Date: Fri, 26 Apr 2024 11:30:54 -0600 Subject: [PATCH 009/243] drew PR changes --- packages/rol/src/algorithm/ROL_Problem_Def.hpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/packages/rol/src/algorithm/ROL_Problem_Def.hpp b/packages/rol/src/algorithm/ROL_Problem_Def.hpp index 8f4a2fee83d2..d589054c611c 100644 --- a/packages/rol/src/algorithm/ROL_Problem_Def.hpp +++ b/packages/rol/src/algorithm/ROL_Problem_Def.hpp @@ -247,6 +247,7 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr } // Transform optimization problem //std::cout << hasBounds_ << " " << hasEquality << " " << hasInequality << " " << hasLinearEquality << " " << hasLinearInequality << std::endl; + nobj_ = nullPtr; if (hasProximableObjective){ if (!hasEquality && !hasInequality && !hasBounds_ && !hasLinearEquality && !hasLinearInequality){ problemType_ = TYPE_P; @@ -269,8 +270,7 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr if (!hasEquality && !hasInequality && !hasBounds_ ) { problemType_ = TYPE_U; obj_ = INPUT_obj_; - nobj_ = nullPtr; - xprim_ = INPUT_xprim_; + xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; con_ = nullPtr; @@ -280,7 +280,6 @@ void 
Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr else if (!hasEquality && !hasInequality && hasBounds_) { problemType_ = TYPE_B; obj_ = INPUT_obj_; - nobj_ = nullPtr; xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = INPUT_bnd_; @@ -292,7 +291,6 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_); problemType_ = TYPE_E; obj_ = INPUT_obj_; - nobj_ = nullPtr; xprim_ = INPUT_xprim_; xdual_ = INPUT_xdual_; bnd_ = nullPtr; @@ -304,7 +302,6 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr ConstraintAssembler cm(con,INPUT_xprim_,INPUT_xdual_,INPUT_bnd_); problemType_ = TYPE_EB; obj_ = INPUT_obj_; - nobj_ = nullPtr; if (cm.hasInequality()) { obj_ = makePtr>(INPUT_obj_); } @@ -318,10 +315,6 @@ void Problem::finalize(bool lumpConstraints, bool printToStream, std::ostr } else { if (!hasBounds_ && !hasLinearInequality) { - if (hasProximableObjective){ - throw Exception::NotImplemented(">>> ROL::TypeP - with constraints is not supported"); - } - nobj_ = nullPtr; ConstraintAssembler cm(lcon,INPUT_xprim_,INPUT_xdual_); xfeas_ = cm.getOptVector()->clone(); xfeas_->set(*cm.getOptVector()); rlc_ = makePtr>(cm.getConstraint(),xfeas_,cm.getResidual()); @@ -491,11 +484,13 @@ const Ptr>& Problem::getObjective() { finalize(); return obj_; } + template const Ptr>& Problem::getProximableObjective(){ finalize(); return nobj_; } + template const Ptr>& Problem::getPrimalOptimizationVector() { finalize(); From 326539221068a90202878f3ebedd114c24050606 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Mon, 6 May 2024 18:34:02 -0600 Subject: [PATCH 010/243] Added a Python script that finds all instances of ParameterList::get in ROL header files (ignores comments) --- packages/rol/refactor/parameterlist.py | 115 +++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 packages/rol/refactor/parameterlist.py diff --git 
a/packages/rol/refactor/parameterlist.py b/packages/rol/refactor/parameterlist.py new file mode 100644 index 000000000000..3acb9af9a135 --- /dev/null +++ b/packages/rol/refactor/parameterlist.py @@ -0,0 +1,115 @@ +import sys +import os +import re +import subprocess as sp +from pathlib import Path + +# +# regex pattern | meaning +# --------------+---------------------------------------------- +# \s* | arbitrary amount of whitespace including none +# \( | literal left parenthesis +# \) | literal right parenthesis +# \. | literal period +# +# ParameterList::get("name","value") +# +# re.search(r'\.\s*get\s*"[a-zA-Z0-9]+"\s*,', source_code) + + +def get_rol_headers(rol_path : Path, token : str) -> [Path]: + result = sp.Popen(['grep','-rl',token,'--include=*.hpp',rol_path],stdout=sp.PIPE) + return [Path(line.decode('utf-8').strip()) for line in result.stdout] + + +def read_file(pathfile : Path) -> str: + with open(pathfile,"r") as f: + text = f.read() + return text + +def strip_cpp_comments( cpp_source : str ) -> str: + + in_string = False + in_single_line_comment = False + in_multi_line_comment = False + result = [] + i = 0 + + while i < len(cpp_source): + + # Check for string start/end + if cpp_source[i] == '"' and not (in_single_line_comment or in_multi_line_comment): + in_string = not in_string + result.append(cpp_source[i]) + # Check for single-line comment start + + elif i+1 < len(cpp_source) and cpp_source[i:i+2] == "//" and not (in_string or in_multi_line_comment): + in_single_line_comment = True + i += 1 # Skip next character to avoid parsing '/' twice + + # Check for multi-line comment start + elif i + 1 < len(cpp_source) and cpp_source[i:i+2] == "/*" and not (in_string or in_single_line_comment): + in_multi_line_comment = True + i += 1 # Skip next character to avoid parsing '*' twice + + # Check for single-line comment end + elif in_single_line_comment and cpp_source[i] == "\n": + in_single_line_comment = False + result.append(cpp_source[i]) # Include newline 
in result + + # Check for multi-line comment end + elif i + 1 < len(cpp_source) and in_multi_line_comment and cpp_source[i:i+2] == "*/": + in_multi_line_comment = False + i += 1 # Skip next character to avoid parsing '/' twice + + # Append character if not in a comment + elif not (in_single_line_comment or in_multi_line_comment): + result.append(cpp_source[i]) + + i += 1 + + return ''.join(result) + + +def contains_escaped_quote_advanced(s: str) -> bool: + i = 0 + while i < len(s): + if s[i] == '\\': + backslash_count = 1 + i += 1 + + # Count consecutive backslashes + while i < len(s) and s[i] == '\\': + backslash_count += 1 + i += 1 + + # If there's an odd number of backslashes followed by a quote, then it is escaped + if i < len(s) and s[i] == '"' and backslash_count % 2 == 1: + return True + else: + i += 1 + return False + + +if __name__ == '__main__': + + """ + Currently iterates over all ROL header files that contain the token (default: ParameterList) + then looks for calls to ParameterList::get and prints the entire "line" from the start of the + line to the end of the statement (semicolon). 
Ignores code comments + """ + + assert( len(sys.argv) > 1 ) + rol_root_path = sys.argv[1] + + token = 'ParameterList' if len(sys.argv) < 3 else sys.argv[2] + + pattern = re.compile(r'^.*?(\.get\s*\(\s*"[^"]*"\s*,(.*)\)\s*;)',re.MULTILINE) + headers = get_rol_headers(rol_root_path,token) + for h in headers: + cpp = strip_cpp_comments(read_file(h)) + matches = re.finditer(pattern,cpp) + print(h) + for m in matches: + print(m.group(0)) + From d436aa512476cec47a9aeafb5b508568783603d3 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Mon, 13 May 2024 16:07:48 -0600 Subject: [PATCH 011/243] Summary of Changes - Added automated Python virtual environment setup to rol/cmake/ROLParameters.cmake - __pycache__ files are now generated in the build tree instead of the source tree - rol_parameters.py scans rol/src and creates alphabetical lists of all unique ParameterList::sublist names and ParameterList::get key names. Two corresponding files `all_keys.txt` and `all_sublists.txt` are written to the build tree. - While not a sufficient condition for a valid parameter, it is a necessary condition that all keys and sublists used be registered in these two files. 
--- packages/rol/CMakeLists.txt | 7 ++ packages/rol/cmake/ROLParameters.cmake | 38 ++++++++ packages/rol/cmake/ROL_config.h.in | 3 + packages/rol/rol_parameters/find_files.py | 75 ++++++++++++++++ .../rol/rol_parameters/parse_parameters.py | 11 +++ .../rol/rol_parameters/read_cpp_source.py | 85 ++++++++++++++++++ packages/rol/rol_parameters/requirements.txt | 1 + packages/rol/rol_parameters/rol_parameters.py | 88 +++++++++++++++++++ packages/rol/rol_parameters/sublists.py | 28 ++++++ 9 files changed, 336 insertions(+) create mode 100644 packages/rol/cmake/ROLParameters.cmake create mode 100644 packages/rol/rol_parameters/find_files.py create mode 100644 packages/rol/rol_parameters/parse_parameters.py create mode 100644 packages/rol/rol_parameters/read_cpp_source.py create mode 100644 packages/rol/rol_parameters/requirements.txt create mode 100644 packages/rol/rol_parameters/rol_parameters.py create mode 100644 packages/rol/rol_parameters/sublists.py diff --git a/packages/rol/CMakeLists.txt b/packages/rol/CMakeLists.txt index 98c276b8c796..b6e045207d7b 100644 --- a/packages/rol/CMakeLists.txt +++ b/packages/rol/CMakeLists.txt @@ -30,6 +30,12 @@ TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PYROL OFF ) +TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PARAMETERLIST_VALIDATION + ENABLE_PARAMETERLIST_VALIDATION + "Build ROL with ParameterList validation." 
+ OFF + ) + # Build Options SET( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) INCLUDE(BuildOptions) @@ -39,6 +45,7 @@ GET_PROPERTY( STACKTRACE_STRING GLOBAL PROPERTY STACKTRACE_IMPL ) #ENDIF() +include(ROLParameters) # diff --git a/packages/rol/cmake/ROLParameters.cmake b/packages/rol/cmake/ROLParameters.cmake new file mode 100644 index 000000000000..7117c4469ea2 --- /dev/null +++ b/packages/rol/cmake/ROLParameters.cmake @@ -0,0 +1,38 @@ +if( ROL_ENABLE_PARAMETERLIST_VALIDATION ) + + message("Enabling automated ParameterList detection and validation") + + if(NOT DEFINED ${PYTHON_EXECUTABLE}) + find_program(PYTHON_EXECUTABLE NAMES python3 python REQUIRED) + endif() + + set( ROL_SOURCE_DIR "${PROJECT_SOURCE_DIR}/packages/rol" ) + set( ROL_PARAMETERS_SOURCE_DIR "${ROL_SOURCE_DIR}/rol_parameters" ) + + set( ROL_BINARY_DIR "${PROJECT_BINARY_DIR}/packages/rol" ) + set( ROL_PARAMETERS_BINARY_DIR "${ROL_BINARY_DIR}/rol_parameters" ) + + set( REQUIREMENTS_FILE "${ROL_PARAMETERS_SOURCE_DIR}/requirements.txt" ) + set( VENV_PATH "${ROL_PARAMETERS_BINARY_DIR}/venv" ) + + # Set up Python virtual environment + add_custom_target( setup_venv + COMMAND ${CMAKE_COMMAND} -E env ${PYTHON_EXECUTABLE} -m venv ${VENV_PATH} + COMMAND ${CMAKE_COMMAND} -E env ${VENV_PATH}/bin/python -m pip install -r ${REQUIREMENTS_FILE} + COMMENT "Setting up virtual environment and installing required Python packages" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) + + message( "Python virtual environment path: ${VENV_PATH}" ) + message( STATUS "Run 'make setup_venv` or your equivalent build system command (e.g. 
ninja setup_venv') to setup the Python virtual environment before building rol_parameters") + + add_custom_target( rol_parameters + COMMAND ${CMAKE_COMMAND} -E env PYTHONPYCACHEPREFIX=${CMAKE_BINARY_DIR}/pycache + ${VENV_PATH}/bin/python ${ROL_PARAMETERS_SOURCE_DIR}/rol_parameters.py ${ROL_SOURCE_DIR} ${ROL_PARAMETERS_BINARY_DIR} + DEPENDS setup_venv + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Running rol_parameters.py using the virtual environment") + + message( STATUS "Run 'make rol_parameters` or your equivalent build system command (e.g. ninja rol_parameters') to build the hierarchical parameter list from the ROL source tree") + +endif() + diff --git a/packages/rol/cmake/ROL_config.h.in b/packages/rol/cmake/ROL_config.h.in index 80df89cfa5fd..d1b46e137485 100644 --- a/packages/rol/cmake/ROL_config.h.in +++ b/packages/rol/cmake/ROL_config.h.in @@ -54,3 +54,6 @@ /* Define for python interface support. */ #cmakedefine ENABLE_PYBIND11_PYROL + +/* Define support for automated ParameterList validation. */ +#cmakedefine ENABLE_PARAMETERLIST_VALIDATION diff --git a/packages/rol/rol_parameters/find_files.py b/packages/rol/rol_parameters/find_files.py new file mode 100644 index 000000000000..108e876ad6cf --- /dev/null +++ b/packages/rol/rol_parameters/find_files.py @@ -0,0 +1,75 @@ +import subprocess +import pathlib + +def find_files( root_path : pathlib.Path, + search_token : str, + include : list[str]=[], + exclude : list[str]=[]) -> list[pathlib.Path]: + """ + Searches for files within a directory tree that contain a specified search token using + the Unix/MacOS command line tool `grep`. + + This function wraps the Unix `grep` command to recursively search through files + starting from a root directory. It returns a list of `pathlib.Path` objects for + files that contain the specified search token. The search can be further refined + by specifying patterns for files to include or exclude. 
+ + Parameters: + - root_path (pathlib.Path): The root directory from which the search will begin. + Must be a valid directory path. + - search_token (str): The token to search for within files. This is passed directly + to `grep`, so regular expressions can be used. + - includes (list[str], optional): A list of patterns to include in the search. + Patterns should match the file names to include. + For example, ['*.py'] to include only Python files. + Defaults to an empty list, which includes all files. + - excludes (list[str], optional): A list of patterns to exclude from the search. + Patterns should match the file names to exclude. + For example, ['*.txt'] to exclude all text files. + Defaults to an empty list, which excludes no files. + + Returns: + - list[pathlib.Path]: A list of `pathlib.Path` objects, each representing a file + that contains the search token. The list will be empty if + no matching files are found. + + Raises: + - Exception: If the `grep` command fails for any reason (e.g., due to an invalid + root_path or issues executing `grep`), an exception is raised with + the error message from `grep`. + + Example: + >>> find_files(pathlib.Path('/path/to/search'), 'def main', includes=['*.py']) + [PosixPath('/path/to/search/script1.py'), PosixPath('/path/to/search/dir/script2.py')] + + Note: + - This function relies on the Unix `grep` command and may not be portable to + environments without `grep` (e.g., some Windows environments without Unix-like + tools installed). 
+ """ + + # Ensure the root path is an existant directory + assert( root_path.exists() ) + assert( root_path.is_dir() ) + + if isinstance(include,str): + include=[include] + if isinstance(exclude,str): + exclude=[exclude] + + + cmd = ['grep','-rl',search_token] + \ + [f'--include={inc}' for inc in include] + \ + [f'--exclude={exc}' for exc in exclude] + \ + [str(root_path)] + + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + # Check if the command was successful + if result.returncode != 0: + raise Exception(f"Error executing grep: {result.stderr}") + + # Parse the output into a list of Path objects + return [pathlib.Path(line.strip()) for line in result.stdout.splitlines()] + + diff --git a/packages/rol/rol_parameters/parse_parameters.py b/packages/rol/rol_parameters/parse_parameters.py new file mode 100644 index 000000000000..b6701cad7430 --- /dev/null +++ b/packages/rol/rol_parameters/parse_parameters.py @@ -0,0 +1,11 @@ +import re + +def crop_to_scope( cpp, token ): + pattern = re.compile(rf'\{{[^{{}}]*{token}[^{{}}][^{{}}]*\}}', re.MULTILINE) + match = re.search(pattern, cpp) + if match: + return match.group() + +def get_sublist_variable_name(cpp,token): + pattern = re.compile(rf'\{{[^{{}}]*{token}[^{{}}][^{{}}]*\}}', re.MULTILINE) + diff --git a/packages/rol/rol_parameters/read_cpp_source.py b/packages/rol/rol_parameters/read_cpp_source.py new file mode 100644 index 000000000000..d08169e2ee3a --- /dev/null +++ b/packages/rol/rol_parameters/read_cpp_source.py @@ -0,0 +1,85 @@ +import pathlib + +def contains_escaped_quote_advanced(s: str) -> bool: + i = 0 + while i < len(s): + if s[i] == '\\': + backslash_count = 1 + i += 1 + + # Count consecutive backslashes + while i < len(s) and s[i] == '\\': + backslash_count += 1 + i += 1 + + # If there's an odd number of backslashes followed by a quote, then it is escaped + if i < len(s) and s[i] == '"' and backslash_count % 2 == 1: + return True + else: + i += 1 + return False 
+ +def strip_cpp_comments(cpp_source: str) -> str: + in_string = False + in_single_line_comment = False + in_multi_line_comment = False + result = [] + i = 0 + while i < len(cpp_source): + # Check for string start/end + if cpp_source[i] == '"' and not (in_single_line_comment or in_multi_line_comment): + # Extract substring from the current position backwards to the last non-escaped quote or start + substring = cpp_source[:i+1][::-1] + # Check if the quote is escaped + if not contains_escaped_quote_advanced(substring): + in_string = not in_string + result.append(cpp_source[i]) + # Check for single-line comment start + elif i+1 < len(cpp_source) and cpp_source[i:i+2] == "//" and not (in_string or in_multi_line_comment): + in_single_line_comment = True + i += 1 # Skip next character to avoid parsing '/' twice + # Check for multi-line comment start + elif i + 1 < len(cpp_source) and cpp_source[i:i+2] == "/*" and not (in_string or in_single_line_comment): + in_multi_line_comment = True + i += 1 # Skip next character to avoid parsing '*' twice + # Check for single-line comment end + elif in_single_line_comment and cpp_source[i] == "\n": + in_single_line_comment = False + result.append(cpp_source[i]) # Include newline in result + # Check for multi-line comment end + elif i + 1 < len(cpp_source) and in_multi_line_comment and cpp_source[i:i+2] == "*/": + in_multi_line_comment = False + i += 1 # Skip next character to avoid parsing '/' twice + # Append character if not in a comment + elif not (in_single_line_comment or in_multi_line_comment): + result.append(cpp_source[i]) + i += 1 + + return ''.join(result) + + +#def is_utf8(data : str) -> bool: +# try: +# data.decode('utf-8') +# return True +# except UnicodeDecodeError: +# return False + + + +def read_cpp_source(cpp_file : pathlib.Path) -> str: + + # Ensure the argument is a file + assert( cpp_file.exists() ) + assert( cpp_file.is_file() ) + + # Read C++ source file to string + with open(cpp_file,"r") as f: + content = 
f.read() + + # Ensure file contains only text +# assert( is_utf8(content) ) + + cpp_source = strip_cpp_comments(content) + + return cpp_source diff --git a/packages/rol/rol_parameters/requirements.txt b/packages/rol/rol_parameters/requirements.txt new file mode 100644 index 000000000000..74460bb214c2 --- /dev/null +++ b/packages/rol/rol_parameters/requirements.txt @@ -0,0 +1 @@ +networkx==2.8.8 diff --git a/packages/rol/rol_parameters/rol_parameters.py b/packages/rol/rol_parameters/rol_parameters.py new file mode 100644 index 000000000000..63d7b7d42a3d --- /dev/null +++ b/packages/rol/rol_parameters/rol_parameters.py @@ -0,0 +1,88 @@ +import re +import sys +import pathlib +import networkx as nx +from find_files import find_files +from read_cpp_source import read_cpp_source + +def compile_list_of_sublists(source_dir,binary_dir): + files = find_files(source_dir, "sublist", include=['*.hpp']) + pattern = re.compile(r'\.sublist\("\s*([^"]*)"\s*\)', re.MULTILINE) + all_sublists = set() + for file in files: + cpp = read_cpp_source(file) + matches = list(re.finditer(pattern, cpp)) + if len(matches): + for m in matches: + all_sublists.add(m.group(1).strip()) + + outfile = binary_dir/'all_sublists.txt' + + with open(outfile,'w') as f: + for key in sorted(all_sublists): + f.write(f'{key}\n') + + print(f'Created file {outfile}') + + +def compile_list_of_keys(source_dir,binary_dir): + parlist_files = find_files(source_dir, "ParameterList", include=['*.hpp'],exclude=['zoo']) + pattern = re.compile(r'(\.get\s*\(\s*"[^"]*"\s*,(.*)\)\s*;)',re.MULTILINE) + all_keys = set() + for file in parlist_files: + cpp = read_cpp_source(file) + matches = list(re.finditer(pattern,cpp)) + if len(matches): + for m in matches: + all_keys.add(m.group(0).split('"')[1].strip()) + + outfile = binary_dir/'all_keys.txt' + + with open(outfile,'w') as f: + for key in sorted(all_keys): + f.write(f'{key}\n') + + print(f'Created file {outfile}') + + + +if __name__ == '__main__': + + assert( len(sys.argv)>2 ) 
+ + rol_root = pathlib.Path(sys.argv[1]) + rol_src = rol_root/'src' + + binary_dir = pathlib.Path(sys.argv[2]) + + assert(rol_src.exists()) + assert(rol_src.is_dir()) + assert(binary_dir.exists()) + assert(binary_dir.is_dir()) + + compile_list_of_sublists(rol_src,binary_dir) + compile_list_of_keys(rol_src,binary_dir) + + +# file = pathlib.Path("/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src/step/ROL_AugmentedLagrangianStep.hpp") +# cpp = read_cpp_source(file) + +# scope = crop_to_scope(cpp, "Penalty Parameter Reciprocal Lower Bound") +# print(scope) + +# pattern = re.compile(rf'\.get\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) +# pattern = re.compile(r'\{[^{}]*\.get\s*\(\s*"[^"]*"\s*,(.*)\)[^{}]\}') +# pattern = re.compile(r'\{[^{}]*\.get\s*\(\s*"[^"]*"\s*,([^{}]*)\)[^{}]*\}', re.MULTILINE) +# +# +# match = re.search(pattern,cpp) +# if match: +# print(match.group()) +# pattern = re.compile(r'(=[^{}]*\.get\s*\(\s*"[^"]*"\s*,(.*)\)\s*;)',re.MULTILINE) +# matches = list(re.finditer(pattern,cpp)) +# if len(matches): +# print(file) +# for m in matches: +# print(m.group(0)) + +# G = nx.DiGraph() diff --git a/packages/rol/rol_parameters/sublists.py b/packages/rol/rol_parameters/sublists.py new file mode 100644 index 000000000000..983ad8e7e7ae --- /dev/null +++ b/packages/rol/rol_parameters/sublists.py @@ -0,0 +1,28 @@ +import re +from find_files import find_files +from read_cpp_source import read_cpp_source + + +def find_sublist_instances(root_path): + assert(root_path.exists()) + assert(root_path.is_dir()) + + files = [find_files(root_path, "sublist", include=['*.hpp']) + sublist_pattern = re.compile(r'^.*?(\.\s*sublist\s*\(\s*"[^"]*"\s*\)(.*);)',re.MULTILINE) +# sublist_pattern = re.compile(r'^.*?(\.\s*sublist\s*\(\s*"[^"]*"\s*\))',re.MULTILINE) + + results = dict() + + for file in files: + cpp = read_cpp_source(file) + matches = list(re.finditer(sublist_pattern, cpp)) + if len(matches): + if file not in results.keys(): + results[file] = [] + 
[results[file].append(m.groups()) for m in matches] + + return results + + + + From 798ae6ff55ab7fb314f5a72c7ef3ef386c366ed0 Mon Sep 17 00:00:00 2001 From: Robert John Baraldi Date: Wed, 15 May 2024 17:49:50 -0600 Subject: [PATCH 012/243] put getDual... back in --- packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp index 07ade19857b4..8f83c3df05f1 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp @@ -105,13 +105,14 @@ void Algorithm::run( Problem &problem, std::ostream &outStream ) { if (problem.getProblemType() == TYPE_P) { run(*problem.getPrimalOptimizationVector(), + *problem.getDualOptimizationVector(), *problem.getObjective(), - *problem.getProximableObjective(), + *problem.getProximableObjective(), outStream); problem.finalizeIteration(); } else { - throw Exception::NotImplemented(">>> ROL::TypeP::Algorithm::run : Optimization problem is not Type P problems!"); + throw Exception::NotImplemented(">>> ROL::TypeP::Algorithm::run : Optimization problem is not Type P!"); } } From 3b538fcff3ccfdb4dd4c16e5eeaebe2165f6a2e1 Mon Sep 17 00:00:00 2001 From: Robert John Baraldi Date: Wed, 15 May 2024 17:53:28 -0600 Subject: [PATCH 013/243] eliminated white space --- packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp index 8f83c3df05f1..f65373250de3 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_Algorithm_Def.hpp @@ -107,7 +107,7 @@ void Algorithm::run( Problem &problem, 
run(*problem.getPrimalOptimizationVector(), *problem.getDualOptimizationVector(), *problem.getObjective(), - *problem.getProximableObjective(), + *problem.getProximableObjective(), outStream); problem.finalizeIteration(); } From e402c59b6edb2fe4c6e575d3c042b1625b13f8dd Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 29 May 2024 15:00:58 -0600 Subject: [PATCH 014/243] Modified CMake invocation of python virtual environment to use poetry. Now scrapes rol/src for and keys and compiles json databases of unique instances of sublists and get keys and the relative path files in which they appear --- packages/rol/cmake/ROLParameters.cmake | 16 +-- packages/rol/rol_parameters/compile_json.py | 32 ++++++ packages/rol/rol_parameters/find_files.py | 29 ++--- packages/rol/rol_parameters/pyproject.toml | 9 ++ .../rol/rol_parameters/read_cpp_source.py | 59 +++++++--- packages/rol/rol_parameters/requirements.txt | 1 - packages/rol/rol_parameters/rol_parameters.py | 101 ++++++------------ 7 files changed, 145 insertions(+), 102 deletions(-) create mode 100644 packages/rol/rol_parameters/compile_json.py create mode 100644 packages/rol/rol_parameters/pyproject.toml delete mode 100644 packages/rol/rol_parameters/requirements.txt diff --git a/packages/rol/cmake/ROLParameters.cmake b/packages/rol/cmake/ROLParameters.cmake index 7117c4469ea2..19a5e8e53014 100644 --- a/packages/rol/cmake/ROLParameters.cmake +++ b/packages/rol/cmake/ROLParameters.cmake @@ -12,15 +12,17 @@ if( ROL_ENABLE_PARAMETERLIST_VALIDATION ) set( ROL_BINARY_DIR "${PROJECT_BINARY_DIR}/packages/rol" ) set( ROL_PARAMETERS_BINARY_DIR "${ROL_BINARY_DIR}/rol_parameters" ) - set( REQUIREMENTS_FILE "${ROL_PARAMETERS_SOURCE_DIR}/requirements.txt" ) + # set( REQUIREMENTS_FILE "${ROL_PARAMETERS_SOURCE_DIR}/requirements.txt" ) set( VENV_PATH "${ROL_PARAMETERS_BINARY_DIR}/venv" ) - # Set up Python virtual environment - add_custom_target( setup_venv - COMMAND ${CMAKE_COMMAND} -E env ${PYTHON_EXECUTABLE} -m venv 
${VENV_PATH} - COMMAND ${CMAKE_COMMAND} -E env ${VENV_PATH}/bin/python -m pip install -r ${REQUIREMENTS_FILE} - COMMENT "Setting up virtual environment and installing required Python packages" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) + add_custom_target( setup_venv + COMMAND ${CMAKE_COMMAND} -E env ${PYTHON_EXECUTABLE} -m venv ${VENV_PATH} + # Install poetry in the virtual environment + COMMAND ${CMAKE_COMMAND} -E env ${VENV_PATH}/bin/pip install poetry + # Use poetry to install dependencies from pyproject.toml + COMMAND ${CMAKE_COMMAND} -E env ${VENV_PATH}/bin/poetry install + COMMENT "Setting up virtual environment and installing required Python packages with poetry" + WORKING_DIRECTORY ${ROL_PARAMETERS_SOURCE_DIR} ) message( "Python virtual environment path: ${VENV_PATH}" ) message( STATUS "Run 'make setup_venv` or your equivalent build system command (e.g. ninja setup_venv') to setup the Python virtual environment before building rol_parameters") diff --git a/packages/rol/rol_parameters/compile_json.py b/packages/rol/rol_parameters/compile_json.py new file mode 100644 index 000000000000..f5ab689c9a00 --- /dev/null +++ b/packages/rol/rol_parameters/compile_json.py @@ -0,0 +1,32 @@ +import re +import pathlib +import json +from collections import OrderedDict +from find_files import find_files +from read_cpp_source import read_cpp_source + +def compile_json( pattern : re.Pattern, + root_dir : pathlib.Path, + relative_pathfiles : list[pathlib.Path], + num_capture_groups : int = 1 ) -> str: + + all_instances = OrderedDict() + + for relative_pathfile in relative_pathfiles: + cpp = read_cpp_source(root_dir / relative_pathfile) + matches = list(re.finditer(pattern, cpp)) + file_str = str(relative_pathfile) + + if len(matches): + for m in matches: + key_name = m.group(1).strip() + if key_name not in all_instances.keys(): + all_instances[key_name] = {file_str} + else: + all_instances[key_name].add(file_str) + + for k,v in all_instances.items(): + all_instances[k] = 
list(v) + + return json.dumps(all_instances,indent=4) + diff --git a/packages/rol/rol_parameters/find_files.py b/packages/rol/rol_parameters/find_files.py index 108e876ad6cf..161497eab4a6 100644 --- a/packages/rol/rol_parameters/find_files.py +++ b/packages/rol/rol_parameters/find_files.py @@ -29,9 +29,9 @@ def find_files( root_path : pathlib.Path, Defaults to an empty list, which excludes no files. Returns: - - list[pathlib.Path]: A list of `pathlib.Path` objects, each representing a file - that contains the search token. The list will be empty if - no matching files are found. + - list[pathlib.Path]: A list of `pathlib.Path` objects (relative to `root_path`), + each representing a file that contains the search token. The + list will be empty if no matching files are found. Raises: - Exception: If the `grep` command fails for any reason (e.g., due to an invalid @@ -40,7 +40,7 @@ def find_files( root_path : pathlib.Path, Example: >>> find_files(pathlib.Path('/path/to/search'), 'def main', includes=['*.py']) - [PosixPath('/path/to/search/script1.py'), PosixPath('/path/to/search/dir/script2.py')] + [PosixPath('script1.py'), PosixPath('script2.py')] Note: - This function relies on the Unix `grep` command and may not be portable to @@ -53,15 +53,18 @@ def find_files( root_path : pathlib.Path, assert( root_path.is_dir() ) if isinstance(include,str): - include=[include] + include = [include] if len(include) else [] if isinstance(exclude,str): - exclude=[exclude] + exclude = [exclude] if len(exclude) else [] + cmd = ['grep','-rl',search_token] - cmd = ['grep','-rl',search_token] + \ - [f'--include={inc}' for inc in include] + \ - [f'--exclude={exc}' for exc in exclude] + \ - [str(root_path)] + if len(include): + cmd += [f'--include={inc}' for inc in include] + if len(exclude): + cmd += [f'--exclude={exc}' for exc in exclude] + + cmd.append(str(root_path)) result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) @@ -69,7 +72,7 @@ def 
find_files( root_path : pathlib.Path, if result.returncode != 0: raise Exception(f"Error executing grep: {result.stderr}") - # Parse the output into a list of Path objects - return [pathlib.Path(line.strip()) for line in result.stdout.splitlines()] - + make_relative = lambda path_str : pathlib.Path(path_str).relative_to(root_path,walk_up=True) + # Parse the output into a list of relative Path objects (relative to root_path) + return sorted([make_relative(line.strip()) for line in result.stdout.splitlines()]) diff --git a/packages/rol/rol_parameters/pyproject.toml b/packages/rol/rol_parameters/pyproject.toml new file mode 100644 index 000000000000..19198f3c5d4a --- /dev/null +++ b/packages/rol/rol_parameters/pyproject.toml @@ -0,0 +1,9 @@ +[tool.poetry] +name = "rol_parameters" +version = "0.1.0" +description = "ROL Parameters - and automated parameter scraper and database maintainer" +authors = ["Greg von Winckel "] + +[tool.poetry.dependencies] +python = "^3.12" + diff --git a/packages/rol/rol_parameters/read_cpp_source.py b/packages/rol/rol_parameters/read_cpp_source.py index d08169e2ee3a..7d8a77f976e5 100644 --- a/packages/rol/rol_parameters/read_cpp_source.py +++ b/packages/rol/rol_parameters/read_cpp_source.py @@ -1,6 +1,22 @@ + + import pathlib -def contains_escaped_quote_advanced(s: str) -> bool: + +def contains_escaped_quote_advanced( s : str ) -> bool: + """ + Determines if a string contains an escaped double quote character. + + This function checks for occurrences of double quotes (") that are + preceded by an odd number of backslashes (\), indicating that the + quote is escaped. + + Parameters: + s (str): The input string to check. + + Returns: + bool: True if an escaped double quote is found, False otherwise. 
+ """ i = 0 while i < len(s): if s[i] == '\\': @@ -19,7 +35,21 @@ def contains_escaped_quote_advanced(s: str) -> bool: i += 1 return False -def strip_cpp_comments(cpp_source: str) -> str: + + +def strip_cpp_comments( cpp_source : str ) -> str: + """ + Removes C++ style comments (both single-line and multi-line) from a string of C++ source code. + + This function strips out both single-line (//) and multi-line (/* ... */) comments + from the provided C++ source code, while preserving the content within string literals. + + Parameters: + cpp_source (str): The input C++ source code as a string. + + Returns: + str: The source code with comments removed. + """ in_string = False in_single_line_comment = False in_multi_line_comment = False @@ -58,17 +88,23 @@ def strip_cpp_comments(cpp_source: str) -> str: return ''.join(result) -#def is_utf8(data : str) -> bool: -# try: -# data.decode('utf-8') -# return True -# except UnicodeDecodeError: -# return False +def read_cpp_source( cpp_file : pathlib.Path ) -> str: + """ + Reads a C++ source file, removes comments, and returns the cleaned source code. + + This function reads the content of a given C++ source file, strips out all comments, + and returns the resulting cleaned source code as a string. + Parameters: + cpp_file (pathlib.Path): The path to the C++ source file to read. -def read_cpp_source(cpp_file : pathlib.Path) -> str: + Returns: + str: The C++ source code with comments removed. + Raises: + AssertionError: If the provided path does not exist or is not a file. 
+ """ # Ensure the argument is a file assert( cpp_file.exists() ) assert( cpp_file.is_file() ) @@ -77,9 +113,8 @@ def read_cpp_source(cpp_file : pathlib.Path) -> str: with open(cpp_file,"r") as f: content = f.read() - # Ensure file contains only text -# assert( is_utf8(content) ) - cpp_source = strip_cpp_comments(content) return cpp_source + + diff --git a/packages/rol/rol_parameters/requirements.txt b/packages/rol/rol_parameters/requirements.txt deleted file mode 100644 index 74460bb214c2..000000000000 --- a/packages/rol/rol_parameters/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -networkx==2.8.8 diff --git a/packages/rol/rol_parameters/rol_parameters.py b/packages/rol/rol_parameters/rol_parameters.py index 63d7b7d42a3d..ea6538a427c1 100644 --- a/packages/rol/rol_parameters/rol_parameters.py +++ b/packages/rol/rol_parameters/rol_parameters.py @@ -1,88 +1,51 @@ import re import sys import pathlib -import networkx as nx from find_files import find_files -from read_cpp_source import read_cpp_source - -def compile_list_of_sublists(source_dir,binary_dir): - files = find_files(source_dir, "sublist", include=['*.hpp']) - pattern = re.compile(r'\.sublist\("\s*([^"]*)"\s*\)', re.MULTILINE) - all_sublists = set() - for file in files: - cpp = read_cpp_source(file) - matches = list(re.finditer(pattern, cpp)) - if len(matches): - for m in matches: - all_sublists.add(m.group(1).strip()) - - outfile = binary_dir/'all_sublists.txt' - - with open(outfile,'w') as f: - for key in sorted(all_sublists): - f.write(f'{key}\n') - - print(f'Created file {outfile}') - - -def compile_list_of_keys(source_dir,binary_dir): - parlist_files = find_files(source_dir, "ParameterList", include=['*.hpp'],exclude=['zoo']) - pattern = re.compile(r'(\.get\s*\(\s*"[^"]*"\s*,(.*)\)\s*;)',re.MULTILINE) - all_keys = set() - for file in parlist_files: - cpp = read_cpp_source(file) - matches = list(re.finditer(pattern,cpp)) - if len(matches): - for m in matches: - 
all_keys.add(m.group(0).split('"')[1].strip()) - - outfile = binary_dir/'all_keys.txt' - - with open(outfile,'w') as f: - for key in sorted(all_keys): - f.write(f'{key}\n') - - print(f'Created file {outfile}') - - +from compile_json import compile_json if __name__ == '__main__': assert( len(sys.argv)>2 ) rol_root = pathlib.Path(sys.argv[1]) +# rol_root = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol') + binary_dir = pathlib.Path(sys.argv[2])#rol_root/'rol_parameters' rol_src = rol_root/'src' - binary_dir = pathlib.Path(sys.argv[2]) + # Create list of all (relative path) header files containing the token `ParameterList` in the C++ source + relative_pathfiles = find_files(rol_src,'ParameterList','*.hpp') + + # Breakdown of the `sublist` search pattern: + # \b : Asserts a word boundary, ensuring that "sublist" is matched as a whole word. + # sublist : Matches the literal string "sublist". + # \s* : Matches zero or more whitespace characters. + # \( : Matches a literal opening parenthesis ((). + # "([^"]+)" : Capturing group that matches one or more characters that are not double quotes ("), + # capturing the content between double quotes. + # \) : Matches a literal closing parenthesis ()). + sublist_pattern = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)', re.MULTILINE) + sublist_json = compile_json(sublist_pattern,rol_src,relative_pathfiles) - assert(rol_src.exists()) - assert(rol_src.is_dir()) - assert(binary_dir.exists()) - assert(binary_dir.is_dir()) + with open(binary_dir / 'sublist.json', 'w') as f: + f.write(sublist_json) - compile_list_of_sublists(rol_src,binary_dir) - compile_list_of_keys(rol_src,binary_dir) + # Breakdown of the `getkey` search pattern: + # \b : Asserts a word boundary, ensuring that "get" is matched as a whole word. + # get : Matches the literal string "sublist". + # \s* : Matches zero or more whitespace characters. + # \( : Matches a literal opening parenthesis ((). 
+ # "([^"]+)" : Capturing group that matches one or more characters that are not double quotes ("), + # capturing the content between double quotes. + # , : Matches a literal comma. + # \) : Matches a literal closing parenthesis ()). + # ; : Matches a literal semicolon + getkey_pattern = re.compile(rf'\bget\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) + getkey_json = compile_json(getkey_pattern,rol_src,relative_pathfiles) + with open(binary_dir / 'getkey.json', 'w') as f: + f.write(getkey_json) -# file = pathlib.Path("/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src/step/ROL_AugmentedLagrangianStep.hpp") -# cpp = read_cpp_source(file) -# scope = crop_to_scope(cpp, "Penalty Parameter Reciprocal Lower Bound") -# print(scope) -# pattern = re.compile(rf'\.get\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) -# pattern = re.compile(r'\{[^{}]*\.get\s*\(\s*"[^"]*"\s*,(.*)\)[^{}]\}') -# pattern = re.compile(r'\{[^{}]*\.get\s*\(\s*"[^"]*"\s*,([^{}]*)\)[^{}]*\}', re.MULTILINE) -# -# -# match = re.search(pattern,cpp) -# if match: -# print(match.group()) -# pattern = re.compile(r'(=[^{}]*\.get\s*\(\s*"[^"]*"\s*,(.*)\)\s*;)',re.MULTILINE) -# matches = list(re.finditer(pattern,cpp)) -# if len(matches): -# print(file) -# for m in matches: -# print(m.group(0)) -# G = nx.DiGraph() From eb0bc71128d8a0e41fa5387d12f4f8b31b796f6b Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 29 May 2024 17:08:00 -0600 Subject: [PATCH 015/243] Added poetry.lock --- packages/rol/rol_parameters/poetry.lock | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 packages/rol/rol_parameters/poetry.lock diff --git a/packages/rol/rol_parameters/poetry.lock b/packages/rol/rol_parameters/poetry.lock new file mode 100644 index 000000000000..1034779ff54e --- /dev/null +++ b/packages/rol/rol_parameters/poetry.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+package = [] + +[metadata] +lock-version = "2.0" +python-versions = "^3.12" +content-hash = "34e39677d8527182346093002688d17a5d2fc204b9eb3e094b2e6ac519028228" From 77c61794d7731093722de98f5ac9f2011700d88f Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Tue, 16 Jan 2024 00:47:42 -0700 Subject: [PATCH 016/243] Temporary fix for wrapping default constructors in the Dynamic interface --- packages/rol/src/function/dynamic/ROL_DynamicConstraint.hpp | 4 +++- packages/rol/src/function/dynamic/ROL_DynamicObjective.hpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/function/dynamic/ROL_DynamicConstraint.hpp b/packages/rol/src/function/dynamic/ROL_DynamicConstraint.hpp index 8606d677ca89..5e0bdb51da3f 100644 --- a/packages/rol/src/function/dynamic/ROL_DynamicConstraint.hpp +++ b/packages/rol/src/function/dynamic/ROL_DynamicConstraint.hpp @@ -124,7 +124,9 @@ class DynamicConstraint : public DynamicFunction { virtual ~DynamicConstraint() {} - DynamicConstraint( std::initializer_list zero_deriv_terms={} ): + DynamicConstraint() : DynamicConstraint( {} ) {} + + DynamicConstraint( std::initializer_list zero_deriv_terms ): DynamicFunction(zero_deriv_terms), unew_ ( nullPtr ), jv_ ( nullPtr ), diff --git a/packages/rol/src/function/dynamic/ROL_DynamicObjective.hpp b/packages/rol/src/function/dynamic/ROL_DynamicObjective.hpp index e68c65099c1c..570ddcd65750 100644 --- a/packages/rol/src/function/dynamic/ROL_DynamicObjective.hpp +++ b/packages/rol/src/function/dynamic/ROL_DynamicObjective.hpp @@ -75,8 +75,9 @@ class DynamicObjective : public DynamicFunction { using V = Vector; using TS = TimeStamp; + DynamicObjective() : DynamicObjective( {} ) {} - DynamicObjective( std::initializer_list zero_deriv_terms={} ) : + DynamicObjective( std::initializer_list zero_deriv_terms ) : DynamicFunction( zero_deriv_terms ) {} virtual ~DynamicObjective() {} From 1ea6251d88c7848322f30d8ffe434ad0cd9daee4 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Tue, 
13 Feb 2024 15:00:19 -0700 Subject: [PATCH 017/243] Temporary SimOpt patch --- .../rol/src/function/simopt/ROL_Constraint_SimOpt.hpp | 9 ++++++++- .../rol/src/function/simopt/ROL_Objective_SimOpt.hpp | 6 +++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp index 287f383de54b..73439a6b0173 100644 --- a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp @@ -206,10 +206,17 @@ class Constraint_SimOpt : public Constraint { --- */ + virtual void value_simopt(Vector &c, + const Vector &u, + const Vector &z, + Real &tol) {}; virtual void value(Vector &c, const Vector &u, const Vector &z, - Real &tol) = 0; + Real &tol) + { + value_simopt(c, u, z, tol); + } /** \brief Given \f$z\f$, solve \f$c(u,z)=0\f$ for \f$u\f$. diff --git a/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp index 77162993ea82..0595e828a4f1 100644 --- a/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp @@ -84,7 +84,11 @@ class Objective_SimOpt : public Objective { /** \brief Compute value. 
*/ - virtual Real value( const Vector &u, const Vector &z, Real &tol ) = 0; + virtual Real value_simopt( const Vector &u, const Vector &z, Real &tol ) { return 0; } + virtual Real value( const Vector &u, const Vector &z, Real &tol ) + { + return value_simopt(u, z, tol); + } Real value( const Vector &x, Real &tol ) { const ROL::Vector_SimOpt &xs = dynamic_cast&>( From 6f76aa1a95116c614d5731b1f097dd6c821b3916 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Wed, 14 Feb 2024 19:07:26 -0700 Subject: [PATCH 018/243] Multi-file CMake --- packages/rol/pyrol/CMakeLists.txt | 43 ++++++++++++------- packages/rol/pyrol/src/CMakeLists.txt | 32 ++++++++++++-- packages/rol/pyrol/src/checkNumberFiles.cmake | 12 ++++++ 3 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 packages/rol/pyrol/src/checkNumberFiles.cmake diff --git a/packages/rol/pyrol/CMakeLists.txt b/packages/rol/pyrol/CMakeLists.txt index be85d4bf11bf..b15d5596d4f0 100644 --- a/packages/rol/pyrol/CMakeLists.txt +++ b/packages/rol/pyrol/CMakeLists.txt @@ -65,14 +65,19 @@ TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_SUPPRESS_ERRORS "Enable the suppress errors option of Binder." OFF ) +TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_USE_ONE_FILE + PYROL_USE_ONE_FILE + "Enable the use of one file by Binder." + OFF ) + TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_CMAKE_ERROR PYROL_CMAKE_ERROR - "Stop the configuration if binder fails." + "Stop the configuration if Binder fails." ON ) TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_VERBOSE PYROL_B_VERBOSE - "Increase the verbosity of binder" + "Increase the verbosity of Binder" OFF ) TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_ENABLE_BINDER_UPDATE @@ -80,6 +85,8 @@ TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_ENABLE_BINDER_UPDATE "Enable the update of the generated source files with Binder." 
OFF ) +SET(PYROL_BINDER_NUM_FILES "100" CACHE STRING "Maximum number of generated files by Binder.") + MESSAGE("-- PYTHON_EXECUTABLE:") IF(NOT DEFINED ${PYTHON_EXECUTABLE}) find_program(PYTHON_EXECUTABLE @@ -266,15 +273,6 @@ IF (PYROL_GENERATE_SRC) endforeach() endforeach() - - #list(REMOVE_DUPLICATES PyROL_all_include_files_without_dir) - #list(REMOVE_ITEM PyROL_all_include_files_without_dir "") - - #list(REMOVE_DUPLICATES PyROL_all_include_files_with_dir) - #list(REMOVE_ITEM PyROL_all_include_files_with_dir "") - - #MESSAGE("PyROL_all_include_files_with_dir = ${PyROL_all_include_files_with_dir}") - SET(CONTENTS "") FOREACH(line IN LISTS all_include_dirs) SET(CONTENTS "${CONTENTS}${line}\n") @@ -295,7 +293,7 @@ IF (PYROL_GENERATE_SRC) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/python) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src) - + file (GLOB PyROLPyFiles2 "${CMAKE_CURRENT_BINARY_DIR}/python/*.py") list (APPEND PyROLPyFiles ${PyROLPyFiles2}) @@ -307,7 +305,12 @@ IF (PYROL_GENERATE_SRC) list(APPEND BINDER_OPTIONS ${binder_include_name}) list(APPEND BINDER_OPTIONS --root-module pyrol) list(APPEND BINDER_OPTIONS --prefix ${CMAKE_CURRENT_BINARY_DIR}/binder) - list(APPEND BINDER_OPTIONS -max-file-size=1000000) + IF(PYROL_USE_ONE_FILE) + list(APPEND BINDER_OPTIONS -single-file) + ELSE() + list(APPEND BINDER_OPTIONS -max-file-size=1000000) + list(APPEND BINDER_OPTIONS -flat) + ENDIF() list(APPEND BINDER_OPTIONS --bind Teuchos) list(APPEND BINDER_OPTIONS --bind ROL) list(APPEND BINDER_OPTIONS --bind pyrol) @@ -317,7 +320,7 @@ IF (PYROL_GENERATE_SRC) list(APPEND BINDER_OPTIONS --config ${BINDER_CFG}) IF(PYROL_SUPPRESS_ERRORS) list(APPEND BINDER_OPTIONS --suppress-errors) - ENDIF() + ENDIF() list(APPEND BINDER_OPTIONS --) IF(TPL_ENABLE_CUDA) list(APPEND BINDER_OPTIONS -x cuda --cuda-host-only) @@ -337,10 +340,18 @@ IF (PYROL_GENERATE_SRC) message("BINDER_OPTIONS='${BINDER_OPTIONS}'") + IF(NOT PYROL_USE_ONE_FILE) + MATH(EXPR NUMBER_FILE 
"${PYROL_BINDER_NUM_FILES}") + + foreach(index RANGE 0 ${NUMBER_FILE}) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/binder/pyrol_${index}.cpp "") + endforeach() + ENDIF() + EXECUTE_PROCESS(COMMAND ${PyROL_BINDER_EXECUTABLE} ${BINDER_OPTIONS} RESULT_VARIABLE STATUS - OUTPUT_VARIABLE OUTPUT_BINDER + OUTPUT_VARIABLE OUTPUT_BINDER ) if(STATUS AND NOT STATUS EQUAL 0) @@ -351,7 +362,7 @@ IF (PYROL_GENERATE_SRC) message("BINDER FAILED: ${STATUS}") endif() else() - message(STATUS "BINDER SUCCESS:") + message(STATUS "BINDER SUCCESS:") message("${OUTPUT_BINDER}") endif() diff --git a/packages/rol/pyrol/src/CMakeLists.txt b/packages/rol/pyrol/src/CMakeLists.txt index 1ebc1e8ab97f..d086b86937d2 100644 --- a/packages/rol/pyrol/src/CMakeLists.txt +++ b/packages/rol/pyrol/src/CMakeLists.txt @@ -4,17 +4,41 @@ FILE(COPY ${PYROL_SRC} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) MESSAGE("CMAKE_CURRENT_BINARY_DIR = ${CMAKE_CURRENT_BINARY_DIR}") -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol.sources BINDER_SRCS) +# file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol.sources BINDER_SRCS) list(TRANSFORM BINDER_SRCS PREPEND "${CMAKE_CURRENT_BINARY_DIR}/../binder/") list(APPEND PYROL_SRC ${BINDER_SRCS}) -MESSAGE("PYROL_SRC with binder = ${PYROL_SRC}") +list(APPEND PYROL_SRC ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol.cpp) + +IF(NOT PYROL_USE_ONE_FILE) + MATH(EXPR NUMBER_FILE "${PYROL_BINDER_NUM_FILES}") + + foreach(index RANGE 0 ${NUMBER_FILE}) + list(APPEND PYROL_SRC ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol_${index}.cpp) + endforeach() + + MATH(EXPR NUMBER_FILE "${NUMBER_FILE}+1") + + EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" + -D "NUMBER_FILE=${NUMBER_FILE}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/checkNumberFiles.cmake" + RESULT_VARIABLE STATUS + OUTPUT_VARIABLE OUTPUT_CHECKNUMBERFILES + ) + + if(STATUS AND NOT STATUS EQUAL 0) + message("${OUTPUT_CHECKNUMBERFILES}") + message(FATAL_ERROR "checkNumberFiles FAILED: ${STATUS}") + endif() +ENDIF() + +MESSAGE("PYROL_SRC with Binder = 
${PYROL_SRC}") pybind11_add_module(pyrol ${PYROL_SRC}) target_include_directories(pyrol PUBLIC ${Mpi4Py_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/../binder "$") target_compile_features(pyrol PUBLIC cxx_std_14) -foreach(depPkg IN LISTS ROL_LIB_ENABLED_DEPENDENCIES) +foreach(depPkg IN LISTS ROL_LIB_ENABLED_DEPENDENCIES) target_link_libraries(pyrol PUBLIC ${depPkg}::all_libs) endforeach() target_link_libraries(pyrol PUBLIC ${Trilinos_EXTRA_LINK_FLAGS}) @@ -25,5 +49,5 @@ INSTALL(TARGETS pyrol add_custom_command(TARGET pyrol POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pyrol.so ${CMAKE_CURRENT_BINARY_DIR}/../pyrol/. - COMMENT "Copy ${PROJECT_BINARY_DIR}/src/PyROL.so" + COMMENT "Copy ${PROJECT_BINARY_DIR}/src/pyrol.so" ) diff --git a/packages/rol/pyrol/src/checkNumberFiles.cmake b/packages/rol/pyrol/src/checkNumberFiles.cmake new file mode 100644 index 000000000000..9cdbbcf3357d --- /dev/null +++ b/packages/rol/pyrol/src/checkNumberFiles.cmake @@ -0,0 +1,12 @@ +if(NOT NUMBER_FILE) + message(FATAL_ERROR "NUMBER_FILE must be specified") +endif() + +if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol_${NUMBER_FILE}.cpp) + MATH(EXPR INDEX "${NUMBER_FILE}+1") + while (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/../binder/pyrol_${INDEX}.cpp) + MATH(EXPR INDEX "${INDEX}+1") + endwhile() + MATH(EXPR INDEX "${INDEX}-1") + message(FATAL_ERROR "File pyrol_${NUMBER_FILE}.cpp exists; please rerun the configuration with PyROL_BINDER_NUM_FILES at least equal to ${INDEX}.") +endif() From 6b347dc396d535b526aff50c86cf23749c909213 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Thu, 15 Feb 2024 19:18:02 -0700 Subject: [PATCH 019/243] Install improvements --- .../pyrol/example/pytorch/rosenbrock_torch.py | 69 +++++++++++++++++++ packages/rol/pyrol/pyproject.toml | 1 + packages/rol/pyrol/scripts/create_sdist | 11 ++- 3 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 packages/rol/pyrol/example/pytorch/rosenbrock_torch.py diff 
--git a/packages/rol/pyrol/example/pytorch/rosenbrock_torch.py b/packages/rol/pyrol/example/pytorch/rosenbrock_torch.py new file mode 100644 index 000000000000..a6e5da79f389 --- /dev/null +++ b/packages/rol/pyrol/example/pytorch/rosenbrock_torch.py @@ -0,0 +1,69 @@ +from TorchVectors import TensorVector +from TorchObjectives import TorchObjective + +from pyrol import getCout, Objective, Problem, Solver +from pyrol.vectors import NumPyVector + +from pyrol.pyrol.Teuchos import ParameterList + +import numpy as np +import time +import torch + + +class RosenbrockObjective(TorchObjective): + + def __init__(self): + super().__init__() + self.alpha = 100 + + def torch_value(self, x): + # return torch.sum(self.alpha*(x[::2]**2 - x[1::2])**2 + (x[::2] - 1)**2) + return torch.sum(self.alpha*(x[:-1]**2 - x[1:])**2 + (x[:-1] - 1)**2) + + +def build_parameter_list(): + params = ParameterList() + params['General'] = ParameterList() + params['General']['Output Level'] = 1 + params['Step'] = ParameterList() + params['Step']['Trust Region'] = ParameterList() + params['Step']['Trust Region']['Subproblem Solver'] = 'Truncated CG' + params['Status Test'] = ParameterList() + params['Status Test']['Iteration Limit'] = 10000 + + return params + + +def main(): + + torch.set_default_dtype(torch.float64) + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + #device = torch.device('cpu') + + start = time.time() + n = int(1e2) + print(device) + x = torch.empty(n, requires_grad=False, device=device) + x[ ::2] = -1.2 + x[1::2] = 1.0 + x = TensorVector(x) + + objective = RosenbrockObjective() + g = x.clone() + + stream = getCout() + + problem = Problem(objective, x, g) + # problem.checkDerivatives(True, stream) + + params = build_parameter_list() + solver = Solver(problem, params) + solver.solve(stream) + print(f"Solve time: {time.time() - start}\n") + + print(g.torch_object.device) + +if __name__ == "__main__": + main() diff --git a/packages/rol/pyrol/pyproject.toml 
b/packages/rol/pyrol/pyproject.toml index eadbd789546b..17c36d0221a2 100644 --- a/packages/rol/pyrol/pyproject.toml +++ b/packages/rol/pyrol/pyproject.toml @@ -47,3 +47,4 @@ ROL_ENABLE_PYROL = "ON" PYROL_ENABLE_BINDER = "OFF" PYROL_PIP_INSTALL = "ON" CMAKE_INSTALL_RPATH="$ORIGIN/lib64;$ORIGIN/lib;$ORIGIN;@loader_path/lib64;@loader_path/lib;@loader_path" +CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF" diff --git a/packages/rol/pyrol/scripts/create_sdist b/packages/rol/pyrol/scripts/create_sdist index a19ba13ba4fc..e43c8194f3da 100755 --- a/packages/rol/pyrol/scripts/create_sdist +++ b/packages/rol/pyrol/scripts/create_sdist @@ -54,23 +54,20 @@ cmake -G Ninja \ -D CMAKE_INSTALL_PREFIX:PATH=install \ ./${REPO_NAME} -B./build -## Step 2: Run Binder. -make -C build - -## Step 3: Create the reduced tarball. +## Step 2: Create the reduced tarball. make package_source -C build TARBALL_NAME="trilinos-${TRILINOS_VERSION}-Source" -## Step 4: Unpack the reduced tarball. +## Step 3: Unpack the reduced tarball. [ -d ${TARBALL_NAME} ] && rm -rf ${TARBALL_NAME} tar -zxf "build/${TARBALL_NAME}.tar.gz" mv ${TARBALL_NAME} pyrol -## Step 5: Create an SDist from the tarball. +## Step 4: Create an SDist from the tarball. python -m pipx run build --sdist pyrol cp -r pyrol/dist/* . -## Step 6: Clean up. +## Step 5: Clean up. 
rm -rf build rm -rf ${REPO_NAME}/packages/rol/pyrol/binder rm -rf pyrol From 391047656bfd94ae95d204b5b39f07383702cdc4 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Wed, 3 Apr 2024 22:05:29 -0600 Subject: [PATCH 020/243] Wrap some of SOL --- packages/rol/pyrol/src/PyROL_ETI.hpp | 78 +++++++++++++++------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/packages/rol/pyrol/src/PyROL_ETI.hpp b/packages/rol/pyrol/src/PyROL_ETI.hpp index 5cb0efd345b9..bf4b60e8cc23 100644 --- a/packages/rol/pyrol/src/PyROL_ETI.hpp +++ b/packages/rol/pyrol/src/PyROL_ETI.hpp @@ -1,63 +1,67 @@ #ifndef PYROL_ETI #define PYROL_ETI -#include -#include -#include -#include -#include +#include -#include -#include -#include -#include #include -#include - +#include +#include +#include +#include +#include +#include #include +#include #include - -#include +#include +#include +#include +#include +#include +#include +#include +#include #define BINDER_ETI_ABSTRACT(CLASS_NAME) \ template class CLASS_NAME; -#define BINDER_ETI_WITH_FOO(CLASS_NAME) \ - template class CLASS_NAME; \ - template <> inline void PyROL::foo(CLASS_NAME a){} - -#define BINDER_ROL_VECTOR(SCALAR) \ - BINDER_ETI_ABSTRACT(Vector) \ - BINDER_ETI_ABSTRACT(Vector_SimOpt) +// #define BINDER_ETI_WITH_FOO(CLASS_NAME) \ +// template class CLASS_NAME; \ +// template <> inline void PyROL::foo(CLASS_NAME a){} -#define BINDER_ROL_OBJECTIVE(SCALAR) \ +#define BINDER_ROL_CORE(SCALAR) \ + BINDER_ETI_ABSTRACT(Constraint) \ BINDER_ETI_ABSTRACT(Objective) \ - BINDER_ETI_ABSTRACT(Objective_SimOpt) \ - BINDER_ETI_ABSTRACT(Reduced_Objective_SimOpt) \ - BINDER_ETI_ABSTRACT(ReducedDynamicObjective) + BINDER_ETI_ABSTRACT(Problem) \ + BINDER_ETI_ABSTRACT(Solver) \ + BINDER_ETI_ABSTRACT(Vector) -#define BINDER_ROL_CONSTRAINT(SCALAR) \ - BINDER_ETI_ABSTRACT(Constraint) \ - BINDER_ETI_ABSTRACT(SimConstraint) \ +#define BINDER_ROL_SIMOPT(SCALAR) \ BINDER_ETI_ABSTRACT(BoundConstraint_SimOpt) \ - BINDER_ETI_ABSTRACT(SerialConstraint) + 
BINDER_ETI_ABSTRACT(Reduced_Objective_SimOpt) \ + BINDER_ETI_ABSTRACT(SimConstraint) \ + BINDER_ETI_ABSTRACT(Vector_SimOpt) -#define BINDER_ROL_SOLVER(SCALAR) \ - BINDER_ETI_ABSTRACT(Solver) +#define BINDER_ROL_DYNAMIC(SCALAR) \ + BINDER_ETI_ABSTRACT(DynamicConstraintCheck) \ + BINDER_ETI_ABSTRACT(DynamicObjectiveCheck) \ + BINDER_ETI_ABSTRACT(ReducedDynamicObjective) \ + BINDER_ETI_ABSTRACT(SerialConstraint) -#define BINDER_ROL_PROBLEM(SCALAR) \ - BINDER_ETI_ABSTRACT(Problem) +#define BINDER_ROL_STOCHASTIC(SCALAR) \ + BINDER_ETI_ABSTRACT(MonteCarloGenerator) \ + BINDER_ETI_ABSTRACT(RiskNeutralObjective) \ + BINDER_ETI_ABSTRACT(SampleGenerator) #define BINDER_ROL_OED(SCALAR) \ BINDER_ETI_ABSTRACT(Factory) namespace ROL { - BINDER_ROL_VECTOR(double) - BINDER_ROL_OBJECTIVE(double) - BINDER_ROL_CONSTRAINT(double) - BINDER_ROL_SOLVER(double) - BINDER_ROL_PROBLEM(double) + BINDER_ROL_CORE(double) + BINDER_ROL_SIMOPT(double) + BINDER_ROL_DYNAMIC(double) + BINDER_ROL_STOCHASTIC(double) namespace OED { BINDER_ROL_OED(double) From 245ef1404f6d24cf12ec46079600863e6a34bf36 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Wed, 3 Apr 2024 22:23:24 -0600 Subject: [PATCH 021/243] Include ParameterList at the top level of PyROL --- packages/rol/pyrol/python/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/rol/pyrol/python/__init__.py b/packages/rol/pyrol/python/__init__.py index 0aa9682944c4..1dbc5274c14d 100644 --- a/packages/rol/pyrol/python/__init__.py +++ b/packages/rol/pyrol/python/__init__.py @@ -1,6 +1,8 @@ import importlib from . 
getTypeName import getTypeName, getDefaultScalarType, ROL_classes, ROL_members +from pyrol.pyrol.Teuchos import ParameterList + __version__ = '0.1.0' def version(): From 2e1085e35e263f8c140e61056865bfce6dc3c3c2 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Wed, 29 May 2024 21:03:49 -0600 Subject: [PATCH 022/243] Python: Wrap TypeP --- packages/rol/pyrol/CMakeLists.txt | 1 + packages/rol/pyrol/scripts/PyROL_RCP.cfg | 1 + packages/rol/pyrol/scripts/create_sdist | 8 ++++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/rol/pyrol/CMakeLists.txt b/packages/rol/pyrol/CMakeLists.txt index b15d5596d4f0..43f9bf395841 100644 --- a/packages/rol/pyrol/CMakeLists.txt +++ b/packages/rol/pyrol/CMakeLists.txt @@ -137,6 +137,7 @@ list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeB") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeE") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeG") +list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeP") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeU") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeB/pqn") list(APPEND ROL_all_include_dirs "${${PACKAGE_NAME}_SOURCE_DIR}/src/algorithm/TypeG/augmentedlagrangian/") diff --git a/packages/rol/pyrol/scripts/PyROL_RCP.cfg b/packages/rol/pyrol/scripts/PyROL_RCP.cfg index f587597d046c..9b977210ef87 100644 --- a/packages/rol/pyrol/scripts/PyROL_RCP.cfg +++ b/packages/rol/pyrol/scripts/PyROL_RCP.cfg @@ -75,6 +75,7 @@ -function ROL::TypeB::AlgorithmFactory -function ROL::TypeE::AlgorithmFactory -function ROL::TypeG::AlgorithmFactory +-function ROL::TypeP::AlgorithmFactory -function ROL::TypeU::AlgorithmFactory -class ROL::ConstraintAssembler diff --git a/packages/rol/pyrol/scripts/create_sdist 
b/packages/rol/pyrol/scripts/create_sdist index e43c8194f3da..6fb60d51e67f 100755 --- a/packages/rol/pyrol/scripts/create_sdist +++ b/packages/rol/pyrol/scripts/create_sdist @@ -4,10 +4,10 @@ # to and then run from the directory containing the ROL repository. -# - Users of this script should check that the variables below are defined +# - Users of this script should check that the variables below are defined # properly. ############################################################################## -TRILINOS_VERSION="14.5" +TRILINOS_VERSION="15.1" REPO_NAME="ROL-Trilinos" LLVM_PREFIX=$(spack location -i llvm) @@ -15,7 +15,7 @@ LLVM_VERSION=$(echo ${LLVM_PREFIX} | awk -F[\-\-] '{print $5}') GCC_PREFIX=$(spack location -i gcc) ############################################################################## -## Other prerequisites: +## Other prerequisites: # * Binder (after the changes on its smart_holder branch) if [ ! command -v binder &> /dev/null ] @@ -61,7 +61,7 @@ TARBALL_NAME="trilinos-${TRILINOS_VERSION}-Source" ## Step 3: Unpack the reduced tarball. [ -d ${TARBALL_NAME} ] && rm -rf ${TARBALL_NAME} tar -zxf "build/${TARBALL_NAME}.tar.gz" -mv ${TARBALL_NAME} pyrol +mv ${TARBALL_NAME} pyrol ## Step 4: Create an SDist from the tarball. 
python -m pipx run build --sdist pyrol From 09421a364f21b9734fb52d082258924d12b840d2 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Mon, 3 Jun 2024 19:09:29 -0600 Subject: [PATCH 023/243] Python: Wrap more Dynamic functionality --- packages/rol/pyrol/scripts/PyROL_RCP.cfg | 3 ++- packages/rol/pyrol/src/PyROL_ETI.hpp | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/packages/rol/pyrol/scripts/PyROL_RCP.cfg b/packages/rol/pyrol/scripts/PyROL_RCP.cfg index 9b977210ef87..3d22c6b3fc4b 100644 --- a/packages/rol/pyrol/scripts/PyROL_RCP.cfg +++ b/packages/rol/pyrol/scripts/PyROL_RCP.cfg @@ -102,7 +102,7 @@ +trampoline_member_function_binder ROL::Vector::clone customClone +include_for_namespace ROL::PyROL --namespace ROL::details +# -namespace ROL::details #################################################r # std library # @@ -157,6 +157,7 @@ -class std::vector +class std::vector +class std::vector ++class std::vector> +class std::vector> -namespace __gnu_cxx diff --git a/packages/rol/pyrol/src/PyROL_ETI.hpp b/packages/rol/pyrol/src/PyROL_ETI.hpp index bf4b60e8cc23..979f641cbc6d 100644 --- a/packages/rol/pyrol/src/PyROL_ETI.hpp +++ b/packages/rol/pyrol/src/PyROL_ETI.hpp @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -46,7 +48,11 @@ BINDER_ETI_ABSTRACT(DynamicConstraintCheck) \ BINDER_ETI_ABSTRACT(DynamicObjectiveCheck) \ BINDER_ETI_ABSTRACT(ReducedDynamicObjective) \ - BINDER_ETI_ABSTRACT(SerialConstraint) + BINDER_ETI_ABSTRACT(SerialConstraint) \ + BINDER_ETI_ABSTRACT(SerialObjective) + +#define BINDER_ROL_UTILS(SCALAR) \ + BINDER_ETI_ABSTRACT(ValidateFunction) #define BINDER_ROL_STOCHASTIC(SCALAR) \ BINDER_ETI_ABSTRACT(MonteCarloGenerator) \ @@ -63,10 +69,15 @@ namespace ROL { BINDER_ROL_DYNAMIC(double) BINDER_ROL_STOCHASTIC(double) +namespace details { + BINDER_ROL_UTILS(double) +} + namespace OED { BINDER_ROL_OED(double) } + } #endif // PYROL_ETI From 
249ed3e0e4634d879670a51ccd1ed1afb2f2b74c Mon Sep 17 00:00:00 2001 From: jdsteinman Date: Fri, 7 Jun 2024 13:49:05 -0600 Subject: [PATCH 024/243] Update Rosenbrock tutorial to ROL 2. --- packages/rol/tutorial/example_unc.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/rol/tutorial/example_unc.cpp b/packages/rol/tutorial/example_unc.cpp index 49bb07c9341d..ab3c56567cab 100644 --- a/packages/rol/tutorial/example_unc.cpp +++ b/packages/rol/tutorial/example_unc.cpp @@ -47,7 +47,7 @@ #define OPTIMIZATION_PROBLEM_REFACTOR -#include "ROL_OptimizationSolver.hpp" +#include "ROL_Solver.hpp" #include "ROL_RandomVector.hpp" #include "ROL_StdObjective.hpp" @@ -107,6 +107,7 @@ int main(int argc, char *argv[]) { try { ROL::ParameterList parlist; + parlist.sublist("General").set("Output Level", 1); parlist.sublist("General").sublist("Secant").set("Use as Hessian",false); parlist.sublist("Step").set("Type","Trust Region"); parlist.sublist("Step").sublist("Trust Region").set("Subproblem Solver","Truncated CG"); @@ -118,10 +119,10 @@ int main(int argc, char *argv[]) { ROL::Ptr > obj = ROL::makePtr>(); - ROL::OptimizationProblem problem( obj, x ); - problem.check(*outStream); + ROL::Ptr > problem = ROL::makePtr>( obj, x ); + problem->check(true, *outStream); - ROL::OptimizationSolver solver( problem, parlist ); + ROL::Solver solver( problem, parlist ); solver.solve(*outStream); *outStream << "x_opt = [" << (*x_ptr)[0] << ", " << (*x_ptr)[1] << "]" << std::endl; @@ -138,8 +139,6 @@ int main(int argc, char *argv[]) { return 0; - - return 0; } From 0554d77895ebc759760c501686e5ffe508e2a2f5 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Fri, 21 Jun 2024 14:41:10 -0600 Subject: [PATCH 025/243] compile_parameters.py makes a csv file of the parameters with their fully expanded sublist prefix. 
There are still a few places in ROL where ParameterList usage is inconsistent with the rest of the code base that must either be handled separately in the parsing process or made to conform in the source. --- .../rol/rol_parameters/compile_parameters.py | 115 ++++++++++++++++++ .../rol/rol_parameters/parse_parameters.py | 11 -- packages/rol/rol_parameters/sublists.py | 28 ----- 3 files changed, 115 insertions(+), 39 deletions(-) create mode 100644 packages/rol/rol_parameters/compile_parameters.py delete mode 100644 packages/rol/rol_parameters/parse_parameters.py delete mode 100644 packages/rol/rol_parameters/sublists.py diff --git a/packages/rol/rol_parameters/compile_parameters.py b/packages/rol/rol_parameters/compile_parameters.py new file mode 100644 index 000000000000..d4c620436475 --- /dev/null +++ b/packages/rol/rol_parameters/compile_parameters.py @@ -0,0 +1,115 @@ +import re +import pathlib +import subprocess +from typing import Set, Optional, List, Tuple +from read_cpp_source import read_cpp_source + +# Compile regex patterns once +SUBLIST_PATTERN = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)', re.MULTILINE) +GET_KEY_PATTERN = re.compile(r'\bget\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) +SET_KEY_PATTERN = re.compile(r'\bset\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) + +def find_instances(root_path: pathlib.Path, + search_token: str, + include: Optional[str|Set[str]] = None, + exclude: Optional[str|Set[str]] = None, + exclude_dir: Optional[str|Set[str]] = None) -> Set[pathlib.Path]: + + # Ensure the root path is an existant directory + assert( root_path.exists() ) + assert( root_path.is_dir() ) + + def join(arg): + if isinstance(arg,str): + return [arg] + else: + return list(arg) + + cmd = ['grep','-r',search_token] + + if include is not None: + for inc in join(include): + cmd.append(f'--include={inc}') + + if exclude is not None: + for exc in join(exclude): + cmd.append(f'--exclude={exc}') + + if exclude_dir is not None: + for exc_dir in 
join(exclude_dir): + cmd.append(f'--exclude-dir={exc_dir}') + + cmd.append(str(root_path)) + + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + # Check if the command was successful + if result.returncode != 0: + raise Exception(f"Error executing grep: {result.stderr}") + + make_relative = lambda path_str : pathlib.Path(path_str).relative_to(root_path,walk_up=True) + + files = { make_relative(line.split(':')[0]) for line in result.stdout.splitlines() } + return files + + + +def parse_cpp_file(file_path: pathlib.Path) -> Set[Tuple[str, ...]]: + cpp = read_cpp_source(file_path) + cpp = re.sub(';', '\n', cpp) + + def has_token(line: str) -> bool: + return ('sublist(' in line) or ('get(' in line) or ('set(' in line) + + lines = [re.sub(r'\s+', ' ', line).strip() for line in cpp.splitlines() if has_token(line) and '"' in line] + + names = {} + code = [] + instances = set() + + for line in lines: + line = re.sub(r'->', '.', line) + if '&' in line: + assignment = line.split('&')[1].strip() + lhs, rhs = assignment.split('=') + names[lhs.strip()] = rhs.strip().split('.') + else: + code.append(line.strip() if '=' not in line else line.split('=')[1].strip()) + + for k, v in names.items(): + if v[0] in names: + names[k] = names[v[0]] + v[1:] + for c in code: + elem = c.split('.') + if elem[0] in names: + elem = names[elem[0]] + elem[1:] + if len(elem) > 1: + tpl = tuple(filter(has_token, elem)) + if all((e.count('"') in [2, 4]) for e in tpl): + instances.add(tuple(e.split('"')[1] for e in tpl)) + + return instances + + + + + +def write_to_csv(instances: List[Tuple[str, ...]], output_file: str): + with open(output_file, 'w') as f: + for line in instances: + f.write(','.join(line) + '\n') + +def main(): + rol_src = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src') + + relative_filepaths = find_instances(rol_src, 'ParameterList', + include={'*.hpp', '*.cpp'}, + exclude_dir={'compatibility', 'step', 'zoo'}) + 
all_instances = set() + for filepath in relative_filepaths: + all_instances.update(parse_cpp_file(rol_src / filepath)) + + write_to_csv(sorted(all_instances), 'all_parameters.csv') + +if __name__ == '__main__': + main() diff --git a/packages/rol/rol_parameters/parse_parameters.py b/packages/rol/rol_parameters/parse_parameters.py deleted file mode 100644 index b6701cad7430..000000000000 --- a/packages/rol/rol_parameters/parse_parameters.py +++ /dev/null @@ -1,11 +0,0 @@ -import re - -def crop_to_scope( cpp, token ): - pattern = re.compile(rf'\{{[^{{}}]*{token}[^{{}}][^{{}}]*\}}', re.MULTILINE) - match = re.search(pattern, cpp) - if match: - return match.group() - -def get_sublist_variable_name(cpp,token): - pattern = re.compile(rf'\{{[^{{}}]*{token}[^{{}}][^{{}}]*\}}', re.MULTILINE) - diff --git a/packages/rol/rol_parameters/sublists.py b/packages/rol/rol_parameters/sublists.py deleted file mode 100644 index 983ad8e7e7ae..000000000000 --- a/packages/rol/rol_parameters/sublists.py +++ /dev/null @@ -1,28 +0,0 @@ -import re -from find_files import find_files -from read_cpp_source import read_cpp_source - - -def find_sublist_instances(root_path): - assert(root_path.exists()) - assert(root_path.is_dir()) - - files = [find_files(root_path, "sublist", include=['*.hpp']) - sublist_pattern = re.compile(r'^.*?(\.\s*sublist\s*\(\s*"[^"]*"\s*\)(.*);)',re.MULTILINE) -# sublist_pattern = re.compile(r'^.*?(\.\s*sublist\s*\(\s*"[^"]*"\s*\))',re.MULTILINE) - - results = dict() - - for file in files: - cpp = read_cpp_source(file) - matches = list(re.finditer(sublist_pattern, cpp)) - if len(matches): - if file not in results.keys(): - results[file] = [] - [results[file].append(m.groups()) for m in matches] - - return results - - - - From e758f520b35c70a5212b5b20ef57c746a6f7f01f Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Mon, 15 Apr 2024 15:48:39 -0600 Subject: [PATCH 026/243] Fixed bound check. 
--- .../src/function/constraint/ROL_Constraint_Partitioned_Def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/src/function/constraint/ROL_Constraint_Partitioned_Def.hpp b/packages/rol/src/function/constraint/ROL_Constraint_Partitioned_Def.hpp index 520f636c61dd..dfc30cfa1194 100644 --- a/packages/rol/src/function/constraint/ROL_Constraint_Partitioned_Def.hpp +++ b/packages/rol/src/function/constraint/ROL_Constraint_Partitioned_Def.hpp @@ -66,7 +66,7 @@ int Constraint_Partitioned::getNumberConstraintEvaluations(void) const { template Ptr> Constraint_Partitioned::get(int ind) const { - if (ind < 0 || ind > static_cast(cvec_.size())) { + if (ind < 0 || ind >= static_cast(cvec_.size())) { throw Exception::NotImplemented(">>> Constraint_Partitioned::get : Index out of bounds!"); } return cvec_[ind]; From df04bda2f02ee8666af530cb165c52c562591e75 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:18:43 -0600 Subject: [PATCH 027/243] Fixed issue with relative tolerances when Solver::reset is used. 
--- .../src/status/ROL_ConstraintStatusTest.hpp | 20 ++++++++++++++----- packages/rol/src/status/ROL_StatusTest.hpp | 18 +++++++++++------ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/packages/rol/src/status/ROL_ConstraintStatusTest.hpp b/packages/rol/src/status/ROL_ConstraintStatusTest.hpp index fd10e73416a0..b15841f12338 100644 --- a/packages/rol/src/status/ROL_ConstraintStatusTest.hpp +++ b/packages/rol/src/status/ROL_ConstraintStatusTest.hpp @@ -58,10 +58,11 @@ template class ConstraintStatusTest : public StatusTest { private: - Real gtol_; - Real ctol_; - Real stol_; + Real gtol_, gtol0_; + Real ctol_, ctol0_; + Real stol_, stol0_; int max_iter_; + bool use_rel_; public: @@ -73,14 +74,23 @@ class ConstraintStatusTest : public StatusTest { ctol_ = parlist.sublist("Status Test").get("Constraint Tolerance", em6); stol_ = parlist.sublist("Status Test").get("Step Tolerance", em6*gtol_); max_iter_ = parlist.sublist("Status Test").get("Iteration Limit", 100); + use_rel_ = parlist.sublist("Status Test").get("Use Relative Tolerances", false); + gtol0_ = gtol_; + ctol0_ = ctol_; + stol0_ = stol_; } - ConstraintStatusTest( Real gtol = 1e-6, Real ctol = 1e-6, Real stol = 1e-12, int max_iter = 100 ) : - gtol_(gtol), ctol_(ctol), stol_(stol), max_iter_(max_iter) {} + ConstraintStatusTest( Real gtol = 1e-6, Real ctol = 1e-6, Real stol = 1e-12, int max_iter = 100, bool use_rel = false ) : + gtol_(gtol), gtol0_(gtol), ctol_(ctol), ctol0_(ctol), stol_(stol), stol0_(stol), max_iter_(max_iter), use_rel_(use_rel) {} /** \brief Check algorithm status. 
*/ virtual bool check( AlgorithmState &state ) { + if (state.iter==0 && use_rel_) { + gtol_ = gtol0_*std::max(state.gnorm,static_cast(1e-2)); + ctol_ = ctol0_*std::max(state.cnorm,static_cast(1e-2)); + stol_ = stol0_*std::max(std::min(state.gnorm,state.cnorm),static_cast(1e-2)); + } if ( ((state.gnorm > gtol_) || (state.cnorm > ctol_)) && (state.snorm > stol_) && (state.iter < max_iter_) ) { diff --git a/packages/rol/src/status/ROL_StatusTest.hpp b/packages/rol/src/status/ROL_StatusTest.hpp index 5f054a86f573..a523273f2261 100644 --- a/packages/rol/src/status/ROL_StatusTest.hpp +++ b/packages/rol/src/status/ROL_StatusTest.hpp @@ -58,8 +58,8 @@ template class StatusTest { private: - Real gtol_; - Real stol_; + Real gtol_, gtol0_; + Real stol_, stol0_; int max_iter_; bool use_rel_; @@ -67,23 +67,29 @@ class StatusTest { virtual ~StatusTest() {} - StatusTest( ROL::ParameterList &parlist ) { + StatusTest( ParameterList &parlist ) { Real em6(1e-6); gtol_ = parlist.sublist("Status Test").get("Gradient Tolerance", em6); stol_ = parlist.sublist("Status Test").get("Step Tolerance", em6*gtol_); max_iter_ = parlist.sublist("Status Test").get("Iteration Limit", 100); use_rel_ = parlist.sublist("Status Test").get("Use Relative Tolerances", false); + gtol0_ = gtol_; + stol0_ = stol_; } StatusTest( Real gtol = 1.e-6, Real stol = 1.e-12, int max_iter = 100, bool use_rel = false ) : - gtol_(gtol), stol_(stol), max_iter_(max_iter), use_rel_(use_rel) {} + gtol_(gtol), gtol0_(gtol), stol_(stol), stol0_(stol), max_iter_(max_iter), use_rel_(use_rel) {} /** \brief Check algorithm status. + + If "Use Relative Tolerances" is set to "true" upon construction, the + gradient and step tolerances are scaled by the norm of the initial + gradient. 
*/ virtual bool check( AlgorithmState &state ) { if (state.iter==0 && use_rel_) { - gtol_ *= state.gnorm; - stol_ *= state.gnorm; + gtol_ = gtol0_*state.gnorm; + stol_ = stol0_*state.gnorm; } if ( (state.gnorm > gtol_) && (state.snorm > stol_) && From 24c9d4e5dd934fadfbfa848b39b757ebbc0d274a Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:23:01 -0600 Subject: [PATCH 028/243] Code clean up to use apply instead of dot, which avoids a Riesz map application. --- .../ROL_AugmentedLagrangianObjective.hpp | 20 ++++------ .../src/step/trustregion/ROL_TruncatedCG.hpp | 38 ++++++++----------- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/packages/rol/src/algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp b/packages/rol/src/algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp index 9f077e3200c0..7a13c1c3cd26 100644 --- a/packages/rol/src/algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp +++ b/packages/rol/src/algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp @@ -183,13 +183,12 @@ class AugmentedLagrangianObjective : public Objective { val *= fscale_; // Compute penalty term const Real half(0.5); - primConVector_->set(multiplier_->dual()); - primConVector_->axpy(half*cscale_*penaltyParameter_,*getConstraintVec(x,tol)); - val += cscale_*getConstraintVec(x,tol)->dot(*primConVector_); + dualConVector_->set(*multiplier_); + dualConVector_->axpy(half*cscale_*penaltyParameter_,getConstraintVec(x,tol)->dual()); + val += cscale_*dualConVector_->apply(*getConstraintVec(x,tol)); + //val += cscale_*getConstraintVec(x,tol)->dot(*primConVector_); // Scale augmented Lagrangian - if (scaleLagrangian_) { - val /= penaltyParameter_; - } + if (scaleLagrangian_) val /= penaltyParameter_; return val; } @@ -203,10 +202,7 @@ class AugmentedLagrangianObjective : public Objective { con_->applyAdjointJacobian(*dualOptVector_,*dualConVector_,x,tol); 
g.axpy(cscale_,*dualOptVector_); // Compute gradient of Augmented Lagrangian - if ( scaleLagrangian_ ) { - const Real one(1); - g.scale(one/penaltyParameter_); - } + if ( scaleLagrangian_ ) g.scale(static_cast(1)/penaltyParameter_); } void hessVec( Vector &hv, const Vector &v, const Vector &x, Real &tol ) { @@ -234,9 +230,7 @@ class AugmentedLagrangianObjective : public Objective { hv.zero(); } // Build hessVec of Augmented Lagrangian - if ( scaleLagrangian_ ) { - hv.scale(static_cast(1)/penaltyParameter_); - } + if ( scaleLagrangian_ ) hv.scale(static_cast(1)/penaltyParameter_); } // Return objective function value diff --git a/packages/rol/src/step/trustregion/ROL_TruncatedCG.hpp b/packages/rol/src/step/trustregion/ROL_TruncatedCG.hpp index 2d159bf7cf88..58437835044d 100644 --- a/packages/rol/src/step/trustregion/ROL_TruncatedCG.hpp +++ b/packages/rol/src/step/trustregion/ROL_TruncatedCG.hpp @@ -113,7 +113,7 @@ class TruncatedCG : public TrustRegion { model.precond(*v_,*g_,s,tol); // Initialize basis vector p_->set(*v_); p_->scale(-one); - Real pnorm2 = v_->dot(g_->dual()); + Real pnorm2 = g_->apply(*v_); if ( pnorm2 <= zero ) { iflag = 4; iter = 0; @@ -122,65 +122,57 @@ class TruncatedCG : public TrustRegion { // Initialize scalar storage iter = 0; iflag = 0; Real kappa(0), beta(0), sigma(0), alpha(0), tmp(0), sMp(0); - Real gv = v_->dot(g_->dual()); + Real gv = g_->apply(*v_); pRed_ = zero; // Iterate CG for (iter = 0; iter < maxit_; iter++) { // Apply Hessian to direction p model.hessVec(*Hp_,*p_,s,tol); // Check positivity of Hessian - kappa = p_->dot(Hp_->dual()); + kappa = Hp_->apply(*p_); if (kappa <= zero) { sigma = (-sMp+sqrt(sMp*sMp+pnorm2*(del*del-snorm2)))/pnorm2; s.axpy(sigma,*p_); - iflag = 2; + iflag = 2; break; } // Update step alpha = gv/kappa; - s_->set(s); + s_->set(s); s_->axpy(alpha,*p_); s1norm2 = snorm2 + two*alpha*sMp + alpha*alpha*pnorm2; // Check if step exceeds trust region radius if (s1norm2 >= del*del) { sigma = 
(-sMp+sqrt(sMp*sMp+pnorm2*(del*del-snorm2)))/pnorm2; s.axpy(sigma,*p_); - iflag = 3; + iflag = 3; break; } // Update model predicted reduction pRed_ += half*alpha*gv; // Set step to temporary step and store norm s.set(*s_); - snorm2 = s1norm2; + snorm2 = s1norm2; // Check for convergence g_->axpy(alpha,*Hp_); normg = g_->norm(); - if (normg < gtol) { - break; - } + if (normg < gtol) break; // Preconditioned updated (projected) gradient vector model.precond(*v_,*g_,s,tol); - tmp = gv; - gv = v_->dot(g_->dual()); - beta = gv/tmp; + tmp = gv; + gv = g_->apply(*v_); + beta = gv/tmp; // Update basis vector p_->scale(beta); p_->axpy(-one,*v_); sMp = beta*(sMp+alpha*pnorm2); - pnorm2 = gv + beta*beta*pnorm2; + pnorm2 = gv + beta*beta*pnorm2; } // Update model predicted reduction - if (iflag > 0) { - pRed_ += sigma*(gv-half*sigma*kappa); - } + if (iflag > 0) pRed_ += sigma*(gv-half*sigma*kappa); // Check iteration count - if (iter == maxit_) { - iflag = 1; - } - if (iflag != 1) { - iter++; - } + if (iter == maxit_) iflag = 1; + if (iflag != 1) iter++; // Update norm of step and update model predicted reduction model.primalTransform(*s_,s); s.set(*s_); From c26f024d93debbea47c723f9dd4c6e6666be9aad Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:24:57 -0600 Subject: [PATCH 029/243] Added get method for the original objective function. 
--- .../rol/src/function/objective/ROL_AffineTransformObjective.hpp | 1 + .../src/function/objective/ROL_AffineTransformObjective_Def.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/rol/src/function/objective/ROL_AffineTransformObjective.hpp b/packages/rol/src/function/objective/ROL_AffineTransformObjective.hpp index d3e0e4e7bad7..00c6f46bbf7e 100644 --- a/packages/rol/src/function/objective/ROL_AffineTransformObjective.hpp +++ b/packages/rol/src/function/objective/ROL_AffineTransformObjective.hpp @@ -86,6 +86,7 @@ class AffineTransformObjective : public Objective { Real value( const Vector &x, Real &tol ) override; void gradient( Vector &g, const Vector &x, Real &tol ) override; void hessVec( Vector &hv, const Vector &v, const Vector &x, Real &tol ) override; + const Ptr> getObjective() const {return obj_;} public: void setParameter(const std::vector ¶m) override; diff --git a/packages/rol/src/function/objective/ROL_AffineTransformObjective_Def.hpp b/packages/rol/src/function/objective/ROL_AffineTransformObjective_Def.hpp index d25515185191..e8d265f3490e 100644 --- a/packages/rol/src/function/objective/ROL_AffineTransformObjective_Def.hpp +++ b/packages/rol/src/function/objective/ROL_AffineTransformObjective_Def.hpp @@ -50,7 +50,7 @@ template AffineTransformObjective::AffineTransformObjective(const Ptr> &obj, const Ptr> &acon, const Vector &range, - const Ptr> &storage) + const Ptr> &storage) : obj_(obj), acon_(acon), storage_(storage) { primal_ = range.clone(); Av_ = range.clone(); From fddc7f0c795e35068e927679e5cf0a0d02085bc4 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:26:07 -0600 Subject: [PATCH 030/243] Simplified the regression error computation. 
--- .../src/sol/function/ROL_RegressionError.hpp | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/packages/rol/src/sol/function/ROL_RegressionError.hpp b/packages/rol/src/sol/function/ROL_RegressionError.hpp index 653f3c99bd9d..1d8126c35ff2 100644 --- a/packages/rol/src/sol/function/ROL_RegressionError.hpp +++ b/packages/rol/src/sol/function/ROL_RegressionError.hpp @@ -72,36 +72,23 @@ class RegressionError : public StdObjective { Real value( const std::vector &x, Real &tol ) { checkSize(x); - // Parse Input Vector - std::vector c; c.assign(x.begin()+1,x.end()); - Real c0 = x[0]; - // Parse Data const std::vector data = Objective::getParameter(); - std::vector X; X.assign(data.begin()+1,data.end()); - Real Y = data[0]; - // Build Error - int Xdim = X.size(); - Real val = Y-c0; - for (int i = 0; i < Xdim; ++i) { - val -= c[i]*X[i]; - } + const unsigned dim = x.size(); + Real val = data[0] - x[0]; + for (unsigned i = 1; i < dim; ++i) val -= data[i] * x[i]; return val; } void gradient( std::vector &g, const std::vector &x, Real &tol ) { - checkSize(x); checkSize(g); - // Parse Data + checkSize(g); const std::vector data = Objective::getParameter(); - std::vector X; X.assign(data.begin()+1,data.end()); - // Build Error - int Xdim = X.size(); + const unsigned dim = g.size(); g[0] = static_cast(-1); - for (int i = 0; i < Xdim; ++i) { - g[i+1] = -X[i]; - } + for (unsigned i = 1; i < dim; ++i) g[i] = -data[i]; } void hessVec( std::vector &hv, const std::vector &v, const std::vector &x, Real &tol ) { + checkSize(hv); hv.assign(hv.size(),static_cast(0)); } }; // class RegressionError From 92c2ef21ff53880e7170413456470d0194c209d1 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:28:36 -0600 Subject: [PATCH 031/243] Updated examples to v2.0. 
--- .../rol/example/tensor-opt/example_01.cpp | 25 +++++++----- .../topology-optimization/example_02.cpp | 40 ++++++++++--------- .../example/topology-optimization/input.xml | 1 + .../topology-optimization/input_ex02.xml | 6 +-- 4 files changed, 42 insertions(+), 30 deletions(-) diff --git a/packages/rol/example/tensor-opt/example_01.cpp b/packages/rol/example/tensor-opt/example_01.cpp index 95657cd0de75..15061f2f43c5 100644 --- a/packages/rol/example/tensor-opt/example_01.cpp +++ b/packages/rol/example/tensor-opt/example_01.cpp @@ -81,8 +81,8 @@ #include "ROL_Vector.hpp" #include "ROL_CArrayVector.hpp" #include "ROL_Bounds.hpp" -#include "ROL_OptimizationSolver.hpp" -#include "ROL_StatusTest.hpp" +#include "ROL_Problem.hpp" +#include "ROL_Solver.hpp" //#pragma GCC diagnostic pop @@ -633,10 +633,10 @@ class SemidefiniteProgramming /******************************************************************************/ private: - const std::string _parfile; - const ROL::Ptr _parlist; + const std::string _parfile; + const ROL::Ptr _parlist; - const ROL::Ptr> _lower, _upper; + const ROL::Ptr> _lower, _upper; const ROL::Ptr> _x; const ROL::Ptr> _bnd; @@ -648,8 +648,8 @@ class SemidefiniteProgramming std::vector>> _ibnd; const ROL::Ptr> _obj; - ROL::Ptr> _problem; - ROL::Ptr> _solver; + ROL::Ptr> _problem; + ROL::Ptr> _solver; DT_ _A_i_up[6]; DT_ _A_i_lo[6]; @@ -691,8 +691,15 @@ class SemidefiniteProgramming my_cast &>(* _icon[3]).set_A(_A_j_lo); my_cast &>(* _icon[3]).set_F(_F_n); - _problem = ROL::makePtr>(_obj, _x, _bnd, _icon, _imul, _ibnd); - _solver = ROL::makePtr>(* _problem, * _parlist); + _problem = ROL::makePtr>(_obj, _x); + _problem->addBoundConstraint(_bnd); + _problem->addConstraint("Inequality Constraint 0", _icon[0], _imul[0], _ibnd[0]); + _problem->addConstraint("Inequality Constraint 1", _icon[1], _imul[1], _ibnd[1]); + _problem->addConstraint("Inequality Constraint 2", _icon[2], _imul[2], _ibnd[2]); + _problem->addConstraint("Inequality Constraint 3", 
_icon[3], _imul[3], _ibnd[3]); + _solver = ROL::makePtr>(_problem, * _parlist); + //_problem = ROL::makePtr>(_obj, _x, _bnd, _icon, _imul, _ibnd); + //_solver = ROL::makePtr>(* _problem, * _parlist); _x->zero(); } diff --git a/packages/rol/example/topology-optimization/example_02.cpp b/packages/rol/example/topology-optimization/example_02.cpp index ebff1af6e015..94638723c457 100644 --- a/packages/rol/example/topology-optimization/example_02.cpp +++ b/packages/rol/example/topology-optimization/example_02.cpp @@ -57,7 +57,8 @@ #include "ROL_StdVector.hpp" #include "ROL_Reduced_Objective_SimOpt.hpp" #include "ROL_Bounds.hpp" -#include "ROL_OptimizationSolver.hpp" +#include "ROL_Problem.hpp" +#include "ROL_Solver.hpp" #include "ROL_ParameterList.hpp" #include "Teuchos_SerialDenseVector.hpp" @@ -926,7 +927,7 @@ int main(int argc, char *argv[]) { uint ny = 10; // Number of y-elements (20 for prob = 0, 20 for prob = 1). int P = 1; // SIMP penalization power. RealT frac = 0.4; // Volume fraction. - ROL::Ptr > pFEM = ROL::makePtr>(nx,ny,P,prob); + ROL::Ptr> pFEM = ROL::makePtr>(nx,ny,P,prob); // Read optimization input parameter list. std::string filename = "input_ex02.xml"; auto parlist = ROL::getParametersFromXmlFile( filename ); @@ -943,36 +944,39 @@ int main(int argc, char *argv[]) { // Initialize bound constraints. ROL::Ptr> lo_ptr = ROL::makePtr>(pFEM->numZ(),1.e-3); ROL::Ptr> hi_ptr = ROL::makePtr>(pFEM->numZ(),1.0); - ROL::Ptr > lop = ROL::makePtr>(lo_ptr); - ROL::Ptr > hip = ROL::makePtr>(hi_ptr); + ROL::Ptr> lop = ROL::makePtr>(lo_ptr); + ROL::Ptr> hip = ROL::makePtr>(hi_ptr); bound = ROL::makePtr>(lop,hip); // Initialize control vector. - ROL::Ptr > z_ptr = ROL::makePtr> (pFEM->numZ(), frac); + ROL::Ptr> z_ptr = ROL::makePtr> (pFEM->numZ(), frac); ROL::StdVector z(z_ptr); - ROL::Ptr > zp = ROL::makePtrFromRef(z); + ROL::Ptr> zp = ROL::makePtrFromRef(z); // Initialize state vector. 
- ROL::Ptr > u_ptr = ROL::makePtr>(pFEM->numU(), 0.0); + ROL::Ptr> u_ptr = ROL::makePtr>(pFEM->numU(), 0.0); ROL::StdVector u(u_ptr); - ROL::Ptr > up = ROL::makePtrFromRef(u); + ROL::Ptr> up = ROL::makePtrFromRef(u); // Initialize adjoint vector. - ROL::Ptr > p_ptr = ROL::makePtr>(pFEM->numU(), 0.0); + ROL::Ptr> p_ptr = ROL::makePtr>(pFEM->numU(), 0.0); ROL::StdVector p(p_ptr); - ROL::Ptr > pp = ROL::makePtrFromRef(p); + ROL::Ptr> pp = ROL::makePtrFromRef(p); // Initialize multiplier vector. - ROL::Ptr > l_ptr = ROL::makePtr>(1, 0.0); + ROL::Ptr> l_ptr = ROL::makePtr>(1, 0.0); ROL::StdVector l(l_ptr); - ROL::Ptr > lp = ROL::makePtrFromRef(l); + ROL::Ptr> lp = ROL::makePtrFromRef(l); // Initialize objective function. pobj = ROL::makePtr>(pFEM); // Initialize reduced objective function. robj = ROL::makePtr>(pobj,pcon,up,zp,pp); - // Run optimization. - ROL::OptimizationProblem problem(robj, zp, bound, vcon, lp); + + // Define optimization problem. + ROL::Ptr> problem = ROL::makePtr>(robj,zp); + problem->addBoundConstraint(bound); + problem->addLinearConstraint("Volume Constraint",vcon,lp); + problem->finalize(false,true,*outStream); bool derivCheck = true; // Derivative check flag. - if (derivCheck) { - problem.check(*outStream); - } - ROL::OptimizationSolver solver(problem, *parlist); + if (derivCheck) problem->check(true,*outStream); + // Solve optimization problem. 
+ ROL::Solver solver(problem, *parlist); solver.solve(*outStream); } catch (std::logic_error& err) { diff --git a/packages/rol/example/topology-optimization/input.xml b/packages/rol/example/topology-optimization/input.xml index d5d8157988f9..04d6678df0d6 100644 --- a/packages/rol/example/topology-optimization/input.xml +++ b/packages/rol/example/topology-optimization/input.xml @@ -2,6 +2,7 @@ + diff --git a/packages/rol/example/topology-optimization/input_ex02.xml b/packages/rol/example/topology-optimization/input_ex02.xml index 1ed82585eaf7..e3c4e128bd15 100644 --- a/packages/rol/example/topology-optimization/input_ex02.xml +++ b/packages/rol/example/topology-optimization/input_ex02.xml @@ -23,8 +23,8 @@ - + @@ -52,7 +52,7 @@ - + @@ -92,7 +92,7 @@ - + From a807756e74776352e416fe352313f82e76c45df6 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:30:55 -0600 Subject: [PATCH 032/243] More example clean up. --- packages/rol/example/dense-hessian/example_01.cpp | 9 ++++++--- packages/rol/src/zoo/testproblems/ROL_HS14.hpp | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/packages/rol/example/dense-hessian/example_01.cpp b/packages/rol/example/dense-hessian/example_01.cpp index 1cbab6195558..ab72db17ac08 100644 --- a/packages/rol/example/dense-hessian/example_01.cpp +++ b/packages/rol/example/dense-hessian/example_01.cpp @@ -49,7 +49,8 @@ #define USE_HESSVEC 1 #include "ROL_Rosenbrock.hpp" -#include "ROL_OptimizationSolver.hpp" +#include "ROL_Problem.hpp" +#include "ROL_Solver.hpp" #include "ROL_ScaledStdVector.hpp" #include "ROL_Stream.hpp" #include "ROL_HelperFunctions.hpp" @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) { // Set algorithm parameters. 
ROL::ParameterList parlist; + parlist.sublist("General").set("Output Level", 1); parlist.sublist("Step").set("Type", "Line Search"); parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").set("Type", "Newton-Krylov"); parlist.sublist("Status Test").set("Gradient Tolerance",1.e-12); @@ -101,9 +103,10 @@ int main(int argc, char *argv[]) { ROL::PrimalScaledStdVector x(x_ptr, scale_ptr); // Define problem. - ROL::OptimizationProblem problem(ROL::makePtrFromRef(obj), ROL::makePtrFromRef(x)); - ROL::OptimizationSolver solver(problem, parlist); + ROL::Ptr> problem = ROL::makePtr>(ROL::makePtrFromRef(obj), ROL::makePtrFromRef(x)); + problem->finalize(false, true, *outStream); // Solve problem. + ROL::Solver solver(problem, parlist); solver.solve(*outStream); // Set true solution. diff --git a/packages/rol/src/zoo/testproblems/ROL_HS14.hpp b/packages/rol/src/zoo/testproblems/ROL_HS14.hpp index 4718309f1177..3e422f934b33 100644 --- a/packages/rol/src/zoo/testproblems/ROL_HS14.hpp +++ b/packages/rol/src/zoo/testproblems/ROL_HS14.hpp @@ -86,7 +86,7 @@ class Objective_HS14 : public StdObjective { const Real c2(2); hv[0] = c2*v[0]; hv[1] = c2*v[1]; - } + } }; template @@ -97,7 +97,7 @@ class Constraint_HS14a : public StdConstraint { void value( std::vector &c, const std::vector &x, Real &tol ) { const Real c1(1), c2(2); c[0] = x[0] - c2*x[1] + c1; - } + } void applyJacobian(std::vector &jv, const std::vector &v, const std::vector &x, Real &tol) { @@ -127,7 +127,7 @@ class Constraint_HS14b : public StdConstraint { void value( std::vector &c, const std::vector &x, Real &tol ) { const Real c0(0.25), c1(1), c2(2); c[0] = -c0*std::pow(x[0],c2) - std::pow(x[1],c2) + c1; - } + } void applyJacobian(std::vector &jv, const std::vector &v, const std::vector &x, Real &tol) { @@ -139,7 +139,7 @@ class Constraint_HS14b : public StdConstraint { const std::vector &x, Real &tol ) { const Real c0(0.25), c2(2); ajv[0] = -c0*c2*x[0]*v[0]; - ajv[1] = -c2*x[1]*v[0]; + ajv[1] = 
-c2*x[1]*v[0]; } void applyAdjointHessian(std::vector &ahuv, const std::vector &u, From 28f3be80ade1165636eb834af563210391a1f0ee Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:31:28 -0600 Subject: [PATCH 033/243] Fixed labels. --- packages/rol/src/oed/utilities/ROL_OED_Timer_Def.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/oed/utilities/ROL_OED_Timer_Def.hpp b/packages/rol/src/oed/utilities/ROL_OED_Timer_Def.hpp index f2eaaac4ef21..3dbc84ead429 100644 --- a/packages/rol/src/oed/utilities/ROL_OED_Timer_Def.hpp +++ b/packages/rol/src/oed/utilities/ROL_OED_Timer_Def.hpp @@ -106,8 +106,8 @@ void Timer::summarize(std::ostream &stream, std::ios_base::fmtflags old(stream.flags()); stream << std::setprecision(6); stream << " " << name_ << std::endl; - stream << std::setw(50) << std::right << "Ave. Time (s)" - << std::setw(25) << std::right << "Ave. #Calls" + stream << std::setw(50) << std::right << "Avg. Time (s)" + << std::setw(25) << std::right << "Avg. #Calls" << std::endl; for (typename std::map::iterator it = count_.begin(); it != count_.end(); ++it) { stream << std::setw(30) << std::right << it->first From ccc00af482f567ffdaaa5915381148ccf9e8a78f Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 26 Jun 2024 15:32:44 -0600 Subject: [PATCH 034/243] Example clean up. 
--- .../src/filtered_compliance_robj.hpp | 16 ++------ .../src/pde_elasticity.hpp | 38 +++++++------------ .../src/pde_filter.hpp | 2 +- .../src/traction.hpp | 3 +- 4 files changed, 20 insertions(+), 39 deletions(-) diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robj.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robj.hpp index 62201ab86f03..1f0e20891590 100644 --- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robj.hpp +++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robj.hpp @@ -114,20 +114,12 @@ class TopOptFilteredComplianceObjective : public ROL::Objective { // Reject: flag = false, iter > -1 // Accept: flag = true, iter > -1 if (flag) { - if (iter > -1) { - update_accept(z,iter); - } - else { - update_temp(z,iter); - } + if (iter > -1) update_accept(z,iter); + else update_temp(z,iter); } else { - if (iter > -1) { - update_revert(z,iter); - } - else { - update_trial(z,iter); - } + if (iter > -1) update_revert(z,iter); + else update_trial(z,iter); } } comp_->update(*Fz_,flag,iter); diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp index be79a9a28f59..dd7d73707640 100644 --- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp +++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp @@ -117,42 +117,30 @@ class PDE_Elasticity : public PDE { int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0); int cubDegree = parlist.sublist("Problem").get("Cubature Degree",4); int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); - int probDim = parlist.sublist("Problem").get("Problem Dimension",2); - if 
(probDim > 3 || probDim < 2) { - TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, - ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!"); - } - if (basisOrder > 2 || basisOrder < 1) { - TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, - ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!"); - } + int probDim = parlist.sublist("Problem").get("Problem Dimension",3); + TEUCHOS_TEST_FOR_EXCEPTION(probDim > 3 || probDim < 2, std::invalid_argument, + ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Problem dimension is not 2 or 3!"); + TEUCHOS_TEST_FOR_EXCEPTION(basisOrder > 2 || basisOrder < 1, std::invalid_argument, + ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Basis order is not 1 or 2!"); if (probDim == 2) { - if (basisOrder == 1) { + if (basisOrder == 1) basisPtr_ = ROL::makePtr>>(); - } - else if (basisOrder == 2) { + else if (basisOrder == 2) basisPtr_ = ROL::makePtr>>(); - } - if (basisOrderDens == 1) { + if (basisOrderDens == 1) basisPtrDens_ = ROL::makePtr>>(); - } - else { + else basisPtrDens_ = ROL::makePtr>>(); - } } else if (probDim == 3) { - if (basisOrder == 1) { + if (basisOrder == 1) basisPtr_ = ROL::makePtr>>(); - } - else if (basisOrder == 2) { + else if (basisOrder == 2) basisPtr_ = ROL::makePtr>>(); - } basisPtrDens_ = ROL::makePtr>>(); } basisPtrs_.clear(); basisPtrsDens_.clear(); - for (int i=0; i { void setDensityFields(const std::vector>>> &basisPtrs) { if (getFields2called_) { TEUCHOS_TEST_FOR_EXCEPTION(getFields2called_, std::invalid_argument, - ">>> PDE-OPT/topo-opt/elasticity/src/pde_elasticity.hpp: Must call before getFields2!"); + ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Must call before getFields2!"); } else { basisPtrDens_ = basisPtrs[0]; diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filter.hpp 
b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filter.hpp index 55ebc1d0cc7a..ad92fa3e8409 100644 --- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filter.hpp +++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filter.hpp @@ -88,7 +88,7 @@ class PDE_Filter : public PDE { int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0); int cubDegree = parlist.sublist("Problem").get("Cubature Degree",4); // int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); - int probDim = parlist.sublist("Problem").get("Problem Dimension",2); + int probDim = parlist.sublist("Problem").get("Problem Dimension",3); TEUCHOS_TEST_FOR_EXCEPTION(probDim>3||probDim<2, std::invalid_argument, ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!"); TEUCHOS_TEST_FOR_EXCEPTION(basisOrder>2||basisOrder<1, std::invalid_argument, diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp index 779b8acc0631..d341d8c07419 100644 --- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp +++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp @@ -78,7 +78,8 @@ class Traction { Traction(void) {}; Traction(int dim) : dim_(dim) { - assert(dim > 3 || dim < 2); + TEUCHOS_TEST_FOR_EXCEPTION(dim > 3 || dim < 2, std::invalid_argument, + ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp: Problem dimension is not 2 or 3!"); if (dim==2) { offset_ = 0; sidesets_.push_back(2); From f3c719d5091fe4a10798fd8838158adf9b7593f0 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 9 Jul 2024 12:24:51 -0600 Subject: [PATCH 035/243] Fixed errors in example_01 --- packages/rol/example/tensor-opt/example_01.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/packages/rol/example/tensor-opt/example_01.cpp b/packages/rol/example/tensor-opt/example_01.cpp index 15061f2f43c5..75b7eace379f 100644 --- a/packages/rol/example/tensor-opt/example_01.cpp +++ b/packages/rol/example/tensor-opt/example_01.cpp @@ -758,7 +758,7 @@ class SemidefiniteProgramming _x->wrap(x); for (auto& it : _imul) it->zero(); _solver->reset(); - _problem->reset(); + //_problem->reset(); _solver->solve(outStream); return _x->data(); @@ -787,7 +787,7 @@ class SemidefiniteProgramming set_node(A_j, lambda_j_lo, lambda_j_up); set_flux(F); - _problem->check(outStream); + _problem->check(true,outStream); } void checkConstraints(DT_ sol[3]) From b822012230a106434bed9141be4667295eb8b48a Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 9 Jul 2024 12:25:36 -0600 Subject: [PATCH 036/243] Changed full space solver from AL to CS. --- packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp index 287f383de54b..66d0c82e4ea9 100644 --- a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp @@ -60,7 +60,7 @@ class Constraint_SimOpt; #include "ROL_SimConstraint.hpp" #include "ROL_Objective_FSsolver.hpp" #include "ROL_TypeU_TrustRegionAlgorithm.hpp" -#include "ROL_TypeE_AugmentedLagrangianAlgorithm.hpp" +#include "ROL_TypeE_CompositeStepAlgorithm.hpp" /** @ingroup func_group \class ROL::Constraint_SimOpt @@ -304,7 +304,7 @@ class Constraint_SimOpt : public Constraint { parlist.sublist("Status Test").set("Constraint Tolerance",ctol); parlist.sublist("Status Test").set("Step Tolerance",stol_); parlist.sublist("Status Test").set("Iteration Limit",maxit_); - Ptr> algo = makePtr>(parlist); + Ptr> algo = makePtr>(parlist); algo->run(u,*obj,*con,*l,*stream); value(c,u,z,tol); } From 
7dede5e578ba92146936805ec9b681858ad17b86 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 10 Jul 2024 17:08:21 -0600 Subject: [PATCH 037/243] Added stabilized biconjugate gradient method. --- packages/rol/src/step/krylov/ROL_BiCGSTAB.hpp | 147 ++++++++++++++++++ .../rol/src/step/krylov/ROL_KrylovFactory.hpp | 20 ++- 2 files changed, 161 insertions(+), 6 deletions(-) create mode 100644 packages/rol/src/step/krylov/ROL_BiCGSTAB.hpp diff --git a/packages/rol/src/step/krylov/ROL_BiCGSTAB.hpp b/packages/rol/src/step/krylov/ROL_BiCGSTAB.hpp new file mode 100644 index 000000000000..5e96b62d90ca --- /dev/null +++ b/packages/rol/src/step/krylov/ROL_BiCGSTAB.hpp @@ -0,0 +1,147 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#ifndef ROL_BICGSTAB_H +#define ROL_BICGSTAB_H + +/** \class ROL::BiCGSTAB + \brief Provides definitions of the stabilized biconjugate gradient (BiCGSTAB) solver. +*/ + +#include "ROL_Krylov.hpp" +#include "ROL_Types.hpp" + +namespace ROL { + +template +class BiCGSTAB : public Krylov { + + bool isInitialized_; + const bool useInexact_; + Ptr> r_, r1_, p_, v_, s_, t_, h_, y_, z_; + +public: + BiCGSTAB(Real absTol = 1.e-4, Real relTol = 1.e-2, unsigned maxit = 100, bool useInexact = false) + : Krylov(absTol,relTol,maxit), isInitialized_(false), useInexact_(useInexact) {} + + BiCGSTAB(ParameterList &parlist, bool useInexact = false) + : Krylov(parlist), isInitialized_(false), useInexact_(useInexact) {} + + Real run( Vector &x, LinearOperator &A, const Vector &b, LinearOperator &M, + int &iter, int &flag ) { + if ( !isInitialized_ ) { + r_ = b.clone(); r1_ = b.clone(); p_ = b.clone(); + v_ = b.clone(); s_ = b.clone(); t_ = b.clone(); + h_ = x.clone(); y_ = x.clone(); z_ = x.clone(); + isInitialized_ = true; + } + + Real rho(1), rho1(1), alpha(1), beta(0), omega(1); + Real rnorm = b.norm(); + Real itol = std::sqrt(ROL_EPSILON()); + const Real rtol = std::min(Krylov::getAbsoluteTolerance(),Krylov::getRelativeTolerance()*rnorm); + if
(rnorm <= rtol) return rnorm; + + x.zero(); + v_->zero(); + p_->zero(); + r_->set(b); + r1_->set(*r_); + + iter = 0; + flag = 0; + + for (iter = 0; iter < (int)Krylov::getMaximumIteration(); iter++) { + rho1 = r_->dot(*r1_); + beta = (rho1/rho)*(alpha/omega); + p_->axpy(-omega,*v_); + p_->scale(beta); + p_->plus(*r_); + + if ( useInexact_ ) + itol = rtol/((Real)Krylov::getMaximumIteration() * rnorm); + M.applyInverse(*y_, *p_, itol); + A.apply(*v_, *y_, itol); + + alpha = rho1 / v_->dot(*r1_); + h_->set(x); + h_->axpy(alpha,*y_); + s_->set(*r_); + s_->axpy(-alpha,*v_); + + rnorm = s_->norm(); + if (rnorm <= rtol) { + x.set(*h_); + break; + } + + if ( useInexact_ ) + itol = rtol/((Real)Krylov::getMaximumIteration() * rnorm); + M.applyInverse(*z_, *s_, itol); + A.apply(*t_, *z_, itol); + + omega = t_->dot(*s_) / t_->dot(*t_); + x.set(*h_); + x.axpy(omega,*z_); + r_->set(*s_); + r_->axpy(-omega,*t_); + + rnorm = r_->norm(); + if (rnorm <= rtol) break; + + rho = rho1; + } + if (iter == (int)Krylov::getMaximumIteration()) { + flag = 1; + } + else { + iter++; + } + return rnorm; + } +}; + + +} + +#endif diff --git a/packages/rol/src/step/krylov/ROL_KrylovFactory.hpp b/packages/rol/src/step/krylov/ROL_KrylovFactory.hpp index f18c599d50a2..49e17ddcfd68 100644 --- a/packages/rol/src/step/krylov/ROL_KrylovFactory.hpp +++ b/packages/rol/src/step/krylov/ROL_KrylovFactory.hpp @@ -51,6 +51,7 @@ #include "ROL_ConjugateResiduals.hpp" #include "ROL_GMRES.hpp" #include "ROL_MINRES.hpp" +#include "ROL_BiCGSTAB.hpp" namespace ROL { /** \enum ROL::EKrylov @@ -60,6 +61,7 @@ namespace ROL { \arg CR Conjugate Residual Method \arg GMRES Generalized Minimum Residual Method \arg MINRES Minimum Residual Method + \arg BICGSTAB Stabilized Bi-Conjugate Gradient Method \arg USERDEFINED User defined Krylov method \arg LAST Dummy type */ @@ -68,6 +70,7 @@ namespace ROL { KRYLOV_CR, KRYLOV_GMRES, KRYLOV_MINRES, + KRYLOV_BICGSTAB, KRYLOV_USERDEFINED, KRYLOV_LAST }; @@ -79,6 +82,7 @@ namespace ROL {
case KRYLOV_CR: retString = "Conjugate Residuals"; break; case KRYLOV_GMRES: retString = "GMRES"; break; case KRYLOV_MINRES: retString = "MINRES"; break; + case KRYLOV_BICGSTAB: retString = "BiCGSTAB"; break; case KRYLOV_USERDEFINED: retString = "User Defined"; break; case KRYLOV_LAST: retString = "Last Type (Dummy)"; break; default: retString = "INVALID EKrylov"; @@ -92,9 +96,11 @@ namespace ROL { \return 1 if the argument is a valid Secant; 0 otherwise. */ inline int isValidKrylov(EKrylov type){ - return( (type == KRYLOV_CG) || - (type == KRYLOV_CR) || - (type == KRYLOV_GMRES) || + return( (type == KRYLOV_CG) || + (type == KRYLOV_CR) || + (type == KRYLOV_GMRES) || + (type == KRYLOV_MINRES) || + (type == KRYLOV_BICGSTAB) || (type == KRYLOV_USERDEFINED) ); } @@ -135,17 +141,19 @@ namespace ROL { parlist.sublist("General").sublist("Krylov").get("Type","GMRES")); Real absTol = parlist.sublist("General").sublist("Krylov").get("Absolute Tolerance", em4); Real relTol = parlist.sublist("General").sublist("Krylov").get("Relative Tolerance", em2); - int maxit = parlist.sublist("General").sublist("Krylov").get("Iteration Limit", 20); + int maxit = parlist.sublist("General").sublist("Krylov").get("Iteration Limit", 20); bool inexact = parlist.sublist("General").get("Inexact Hessian-Times-A-Vector",false); switch(ekv) { case KRYLOV_CR: return makePtr>(absTol,relTol,maxit,inexact); case KRYLOV_CG: return makePtr>(absTol,relTol,maxit,inexact); - case KRYLOV_MINRES: - return makePtr>(absTol,relTol,maxit,inexact); case KRYLOV_GMRES: return makePtr>(parlist); + case KRYLOV_MINRES: + return makePtr>(absTol,relTol,maxit,inexact); + case KRYLOV_BICGSTAB: + return makePtr>(absTol,relTol,maxit,inexact); default: return nullPtr; } From f93884216b4a170c84c539c1106ae18cd8cb4c03 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 18 Jul 2024 17:13:59 -0600 Subject: [PATCH 038/243] Revert "Temporary SimOpt patch" This reverts commit 1ea6251d88c7848322f30d8ffe434ad0cd9daee4. 
--- .../rol/src/function/simopt/ROL_Constraint_SimOpt.hpp | 9 +-------- .../rol/src/function/simopt/ROL_Objective_SimOpt.hpp | 6 +----- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp index 73439a6b0173..287f383de54b 100644 --- a/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Constraint_SimOpt.hpp @@ -206,17 +206,10 @@ class Constraint_SimOpt : public Constraint { --- */ - virtual void value_simopt(Vector &c, - const Vector &u, - const Vector &z, - Real &tol) {}; virtual void value(Vector &c, const Vector &u, const Vector &z, - Real &tol) - { - value_simopt(c, u, z, tol); - } + Real &tol) = 0; /** \brief Given \f$z\f$, solve \f$c(u,z)=0\f$ for \f$u\f$. diff --git a/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp index 0595e828a4f1..77162993ea82 100644 --- a/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Objective_SimOpt.hpp @@ -84,11 +84,7 @@ class Objective_SimOpt : public Objective { /** \brief Compute value. 
*/ - virtual Real value_simopt( const Vector &u, const Vector &z, Real &tol ) { return 0; } - virtual Real value( const Vector &u, const Vector &z, Real &tol ) - { - return value_simopt(u, z, tol); - } + virtual Real value( const Vector &u, const Vector &z, Real &tol ) = 0; Real value( const Vector &x, Real &tol ) { const ROL::Vector_SimOpt &xs = dynamic_cast&>( From f6a66bf156a0128e09a45391af4f9008c05e0ea1 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Fri, 19 Jul 2024 12:32:16 -0600 Subject: [PATCH 039/243] Add fix for SimOpt --- packages/rol/pyrol/python/getTypeName.py | 44 ++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/packages/rol/pyrol/python/getTypeName.py b/packages/rol/pyrol/python/getTypeName.py index 7838425e1f0a..f71336fca1ac 100644 --- a/packages/rol/pyrol/python/getTypeName.py +++ b/packages/rol/pyrol/python/getTypeName.py @@ -19,13 +19,38 @@ def new_method(self, *args, **kwargs): return cls_method(self, *args, **kwargs) return new_method -def withTrackingConstructor(cls): +def _decorator23(value_method): + def value(*args): + if len(args) == 2: + x, tol = args + return value_method(x.get_1(), x.get_2(), tol) + elif len(args) == 3: + u, z, tol = args + return value_method(u, z, tol) + else: + raise ArgumentError("Unexcepted number of arguments") + return value + + +def _decorator34(value_method): + def value(*args): + if len(args) == 3: + c, x, tol = args + return value_method(c, x.get_1(), x.get_2(), tol) + elif len(args) == 4: + c, u, z, tol = args + return value_method(c, u, z, tol) + raise ArgumentError("Unexcepted number of arguments") + return value + + +def withTrackingConstructor(cls_name, cls): class newCls(cls): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._tracked_constructor_args = [] - + track(self, *args, **kwargs) method_names = [m for m in dir(cls) if callable(getattr(cls, m))] @@ -33,13 +58,18 @@ def __init__(self, *args, **kwargs): if name.startswith('add'): 
setattr(newCls, name, tracking_method(getattr(cls, name))) + if cls_name.find('Objective_SimOpt') == 0: + self.value = _decorator23(self.value) + elif cls_name.find('Constraint_SimOpt') == 0: + self.value = _decorator34(self.value) + return newCls ROL_members = {} for cls_name, cls_obj in inspect.getmembers(sys.modules['pyrol.pyrol.ROL']): if inspect.isclass(cls_obj): - cls_obj = withTrackingConstructor(cls_obj) + cls_obj = withTrackingConstructor(cls_name, cls_obj) trackedTypes.append(cls_obj) setattr(sys.modules['pyrol.pyrol.ROL'], cls_name, cls_obj) ROL_members[cls_name] = (cls_obj, inspect.isclass(cls_obj)) @@ -57,3 +87,11 @@ def getTypeName(class_name, scalar_type=getDefaultScalarType()): return ROL_classes[i][1] print("Warning: Unknown type \"{}\", the function returns None.".format(class_name)) return None + + +def getTypeName2(class_name): + for i in range(0, len(ROL_classes)): + if ROL_classes[i][0].lower().find(class_name.lower()) == 0: + return ROL_classes[i][1] + print("Warning: Unknown type \"{}\", the function returns None.".format(class_name)) + return None From c4bbff6688b8b14ea98e7ba168bc4ebacc9dd54f Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Fri, 19 Jul 2024 16:36:04 -0600 Subject: [PATCH 040/243] Added safeguard to avoid applying null space operator if all components are active. 
--- .../algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp index 1b9037800551..1fded5aacbc9 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp @@ -169,8 +169,11 @@ void LSecantBAlgorithm::run(Vector &x, bnd.pruneActive(*pwa1,x,zero); gfree->set(pwa1->dual()); if (hasEcon_) { - applyFreePrecond(*pwa1,*gfree,x,*secant_,bnd,tol0,*dwa1,*pwa2); - gfnorm = pwa1->norm(); + gfnorm = gfree->norm(); + if (gfnorm > zero) { + applyFreePrecond(*pwa1,*gfree,x,*secant_,bnd,tol0,*dwa1,*pwa2); + gfnorm = pwa1->norm(); + } } else { gfnorm = gfree->norm(); From e67173ac2cec396bf91655f5b3b9f7cec7e4a16f Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 15 Jul 2024 17:15:36 -0600 Subject: [PATCH 041/243] MueLu: Cut Drop Converted to Use Kokkos Original code within ORIGINAL ifdef. New code within NEW ifdef. DropTol structure marked with KOKKOS_INLINE_FUNCTION and default values are hard coded. Default Algorithm and Cut Drop Algorithm split into separate for loops in NEW code. Cut Drop converted to use Kokkos nested parallel loops. Timers placed in new code and are commented out. Code passes current unit tests. Saw a speedup of about 1.5x with Cuda and 1.2x with Serial when running unit tests with 10,000,000 rows. 
Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 + .../MueLu_CoalesceDropFactory_def.hpp | 1663 ++++++++++++++++- 2 files changed, 1657 insertions(+), 7 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index 96b5e778f6bc..db5e9a291313 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,6 +160,7 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build + void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 2c421c477bde..a8befaea592b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,6 +61,8 @@ #include +#include //NEW +#include //NEW #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -92,22 +94,30 @@ namespace MueLu { namespace Details { template struct DropTol { + KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{Teuchos::ScalarTraits::zero()}; - real_type diag{Teuchos::ScalarTraits::zero()}; - LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{0}; + real_type 
diag{0}; + LO col{-1}; + //NEW Can't run these host functions on device + //real_type val{Teuchos::ScalarTraits::zero()}; + //real_type diag{Teuchos::ScalarTraits::zero()}; + //LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for debugging info @@ -414,6 +424,1645 @@ void CoalesceDropFactory::Build(Level TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ + if (algo == "classical") { + if (predrop_ == null) { + // ap: this is a hack: had to declare predrop_ as mutable + predrop_ = rcp(new PreDropFunctionConstVal(threshold)); + } + + if (predrop_ != null) { + RCP predropConstVal = rcp_dynamic_cast(predrop_); + TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, + "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); + // If a user provided a predrop function, it overwrites the XML threshold parameter + SC newt = predropConstVal->GetThreshold(); + if (newt != threshold) { + GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; + threshold = newt; + } + } + // At this points we either have + // (predrop_ != null) + // Therefore, it is sufficient to check only threshold + if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { + // Case 1: scalar problem, no dropping => just use matrix graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + // Detect and record rows that correspond to Dirichlet boundary conditions + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + 
if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + + graph->SetBoundaryNodeMap(boundaryNodes); + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes[i]) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", 1); + Set(currentLevel, "Graph", graph); + + } else if ((BlockSize == 1 && threshold != STS::zero()) || + (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || + (BlockSize == 1 && useSignedClassicalRS) || + (BlockSize == 1 && useSignedClassicalSA)) { + // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original + // graph's map information, e.g., whether index is local + // OR a matrix without a CrsGraph + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + using MT = typename STS::magnitudeType; + RCP ghostedDiag; + ArrayRCP ghostedDiagVals; + ArrayRCP negMaxOffDiagonal; + // RS style needs the max negative off-diagonal, SA style needs the diagonal + if (useSignedClassicalRS) { + if (ghostedBlockNumber.is_null()) { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; + } else { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculating max block off-diagonal" 
<< std::endl; + } + } else { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + ghostedDiagVals = ghostedDiag->getData(0); + } + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) { + if (ghostedBlockNumber.is_null()) { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + } else { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); + } + } + + LO realnnz = 0; + rows(0) = 0; +#define NEW +#ifdef ORIGINAL + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + if (classicalAlgo == defaultAlgo) { + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? 
-1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } else { + /* Cut Algorithm */ + // CMS + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = Teuchos::ScalarTraits::one(); + LO rownnz = 0; + // NOTE: This probably needs to be fixed for rowsum + + // find magnitudes + for (LO colID = 0; colID < (LO)nnz; colID++) { + LO col = indices[colID]; + if (row == col) { + 
drop_vec.emplace_back(zero, one, colID, false); + continue; + } + + // Don't aggregate boundaries + if (boundaryNodes[colID]) continue; + typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (classicalAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (classicalAlgo == scaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + bool drop = false; + // printf("[%d] Scaled Cut: ",(int)row); + // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; + +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); + } + drop_vec[i].drop = drop; + } + // printf("\n"); + } + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 
0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + // don't drop diagonal + if (row == col) { + columns[realnnz++] = col; + rownnz++; + continue; + } + + if (!drop_vec[idxID].drop) { + columns[realnnz++] = col; + rownnz++; + } else { + numDropped++; + } + } + // CMS + rows[row + 1] = realnnz; + } + } // end for row +#endif + +#ifdef NEW + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? 
-1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { //NEW START + //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); + //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); + //stackedTimer->start("init"); + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using DropTol = Details::DropTol; + + //move from host 
to device + ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); + Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + //stackedTimer->stop("init"); + + //stackedTimer->start("loop"); + Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + LO col = rowView.colidx(colID); + if(row == col) { + drop_view(colID) = DropTol(0, 1, colID, false); + count++; + } + //Don't aggregate boundaries + else if(!boundaryNodesDevice(colID)) { + typename STS::magnitudeType aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 + drop_view(colID) = DropTol(aij, aiiajj, colID, false); + count++; + } + }, n); + if (algorithm == unscaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + //find index where dropping starts + size_t dropStart; + 
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val; + auto b = y.val; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } else if (algorithm == scaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + //find index where dropping starts + size_t dropStart; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID).drop) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + //stackedTimer->stop("loop"); + + //stackedTimer->start("remove"); + + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); 
+ + //stackedTimer->stop("remove"); + + //update row indices + //stackedTimer->start("scan"); + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + //stackedTimer->stop("scan"); + + //stackedTimer->stop("timer"); + //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); + } //NEW END +#endif + + numTotal = A->getLocalNumEntries(); + + if (aggregationMayCreateDirichlet) { + // If the only element remaining after filtering is diagonal, mark node as boundary + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + if (rows[row + 1] - rows[row] <= 1) + boundaryNodes[row] = true; + } + } + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A")); + graph->SetBoundaryNodeMap(boundaryNodes); + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", 1); + + // If we're doing signed classical, we might want to block-diagonalize *after* the dropping + if (generateColoringGraph) { + RCP colorGraph; + RCP importer = A->getCrsGraph()->getImporter(); + BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); + Set(currentLevel, "Coloring Graph", colorGraph); + // #define CMS_DUMP +#ifdef CMS_DUMP + { + Xpetra::IO::Write("m_regular_graph." 
+ std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); + Xpetra::IO::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); + // int rank = graph->GetDomainMap()->getComm()->getRank(); + // { + // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // colorGraph->print(*fancy,Debug); + // } + // { + // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // graph->print(*fancy,Debug); + // } + } +#endif + } // end generateColoringGraph + } else if (BlockSize > 1 && threshold == STS::zero()) { + // Case 3: Multiple DOF/node problem without dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. 
The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) 
+ Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // loop over all local nodes + LO realnnz = 0; + rows(0) = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. 
+ bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows(row + 1) = realnnz; + } // for (LO row = 0; row < numRows; row++) + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + 
GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + + } else if (BlockSize > 1 && threshold != STS::zero()) { + // Case 4: Multiple DOF/node problem with dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) 
+ Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // extract diagonal data for dropping strategy + RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); + + // loop over all local nodes + LO realnnz = 0; + rows[0] = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. 
+ bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns[realnnz++] = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + // columns.resize(realnnz); + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + 
MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + } + + } else if (algo == "distance laplacian") { + LO blkSize = A->GetFixedBlockSize(); + GO indexBase = A->getRowMap()->getIndexBase(); + // [*0*] : FIXME + // ap: somehow, if I move this line to [*1*], Belos throws an error + // I'm not sure what's going on. Do we always have to Get data, if we did + // DeclareInput for it? + // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + if ((blkSize == 1) && (threshold == STS::zero())) { + // Trivial case: scalar problem, no dropping. 
Can return original graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + graph->SetBoundaryNodeMap(pointBoundaryNodes); + graphType = "unamalgamated"; + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) + if (pointBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", blkSize); + Set(currentLevel, "Graph", graph); + + } else { + // ap: We make quite a few assumptions here; general case may be a lot different, + // but much much harder to implement. We assume that: + // 1) all maps are standard maps, not strided maps + // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic + // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i + // + // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, + // but as I totally don't understand that code, here is my solution + + // [*1*]: see [*0*] + + // Check that the number of local coordinates is consistent with the #rows in A + TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, + "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); + + const RCP colMap = A->getColMap(); + RCP uniqueMap, nonUniqueMap; + Array colTranslation; + if (blkSize == 1) { + uniqueMap = A->getRowMap(); + nonUniqueMap = A->getColMap(); + graphType = "unamalgamated"; + + } else { + uniqueMap = Coords->getMap(); + 
TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, + "Different index bases for matrix and coordinates"); + + AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); + + graphType = "amalgamated"; + } + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + RCP ghostedCoords; + RCP ghostedLaplDiag; + Teuchos::ArrayRCP ghostedLaplDiagData; + if (threshold != STS::zero()) { + // Get ghost coordinates + RCP importer; + { + SubFactoryMonitor m1(*this, "Import construction", currentLevel); + if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { + GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; + importer = realA->getCrsGraph()->getImporter(); + } else { + GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; + importer = ImportFactory::Build(uniqueMap, nonUniqueMap); + } + } // subtimer + ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); + { + SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); + ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); + } // subtimer + + // Construct Distance Laplacian diagonal + RCP localLaplDiag = VectorFactory::Build(uniqueMap); + Array indicesExtra; + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + { + SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); + ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + + if (blkSize == 1) { + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + } else { + // Merge rows of A + indicesExtra.resize(0); + MergeRows(*A, row, 
indicesExtra, colTranslation); + indices = indicesExtra; + } + + LO nnz = indices.size(); + bool haveAddedToDiag = false; + for (LO colID = 0; colID < nnz; colID++) { + const LO col = indices[colID]; + + if (row != col) { + if (use_dlap_weights == SINGLE_WEIGHTS) { + /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, + MueLu::Utilities::Distance2(coordData, row, col), + MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + haveAddedToDiag = true; + } + } + // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. + // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. 
+ if (!haveAddedToDiag) + localLaplDiagData[row] = STS::rmax(); + } + } // subtimer + { + SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); + ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); + ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); + ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); + } // subtimer + + } else { + GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; + } + + // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + +#ifdef HAVE_MUELU_DEBUG + // DEBUGGING + for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; +#endif + + // Extra array for if we're allowing symmetrization with cutting + ArrayRCP rows_stop; + bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; + if (use_stop_array) + // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); + rows_stop.resize(numRows); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + LO realnnz = 0; + rows(0) = 0; + + Array indicesExtra; + { + SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + + ArrayView vals; // CMS hackery + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + bool isBoundary = false; + + if (blkSize == 1) { + // ArrayView vals;//CMS 
uncomment + A->getLocalRowView(row, indices, vals); + isBoundary = pointBoundaryNodes[row]; + } else { + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + for (LO j = 0; j < blkSize; j++) { + if (!pointBoundaryNodes[row * blkSize + j]) { + isBoundary = false; + break; + } + } + + // Merge rows of A + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + } + numTotal += indices.size(); + + LO nnz = indices.size(), rownnz = 0; + + if (use_stop_array) { + rows(row + 1) = rows(row) + nnz; + realnnz = rows(row); + } + + if (threshold != STS::zero()) { + // default + if (distanceLaplacianAlgo == defaultAlgo) { + /* Standard Distance Laplacian */ + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + columns(realnnz++) = col; + rownnz++; + continue; + } + + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + if (aij > aiiajj) { + columns(realnnz++) = col; + rownnz++; + } else { + numDropped++; + } + } + } else { + /* Cut Algorithm */ + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = 
Teuchos::ScalarTraits::one(); + + // find magnitudes + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + drop_vec.emplace_back(zero, one, colID, false); + continue; + } + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + + real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (distanceLaplacianAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; 
+ auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } + + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + + // don't drop diagonal + if (row == col) { + columns(realnnz++) = col; + rownnz++; + // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); + continue; + } + + if (!drop_vec[idxID].drop) { + columns(realnnz++) = col; + // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + rownnz++; + } else { + // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + numDropped++; + } + } + } + } else { + // Skip laplace calculation and threshold comparison for zero threshold + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + + if (use_stop_array) + rows_stop[row] = rownnz + rows[row]; + else + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + + } // subtimer + + if (use_stop_array) { + // Do symmetrization of the cut 
matrix + // NOTE: We assume nested row/column maps here + for (LO row = 0; row < numRows; row++) { + for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { + LO col = columns[colidx]; + if (col >= numRows) continue; + + bool found = false; + for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { + if (columns[t_col] == row) + found = true; + } + // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing + // into a Dirichlet unknown. In that case don't. + if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { + LO new_idx = rows_stop[col]; + // printf("(%d,%d) SYMADD entry\n",col,row); + columns[new_idx] = row; + rows_stop[col]++; + numDropped--; + } + } + } + + // Condense everything down + LO current_start = 0; + for (LO row = 0; row < numRows; row++) { + LO old_start = current_start; + for (LO col = rows(row); col < rows_stop[row]; col++) { + if (current_start != col) { + columns(current_start) = columns(col); + } + current_start++; + } + rows[row] = old_start; + } + rows(numRows) = realnnz = current_start; + } + + RCP graph; + { + SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); + graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + } // subtimer + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" + << " using threshold " << dirichletThreshold << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", 
blkSize); + } + } + + if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { + RCP> comm = A->getRowMap()->getComm(); + GO numGlobalTotal, numGlobalDropped; + MueLu_sumAll(comm, numTotal, numGlobalTotal); + MueLu_sumAll(comm, numDropped, numGlobalDropped); + GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; + if (numGlobalTotal != 0) + GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; + GetOStream(Statistics1) << std::endl; + } + + } else { + // what Tobias has implemented + + SC threshold = as(pL.get("aggregation: drop tol")); + // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + GetOStream(Runtime0) << "algorithm = \"" + << "failsafe" + << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + RCP rowMap = A->getRowMap(); + RCP colMap = A->getColMap(); + + LO blockdim = 1; // block dim for fixed size blocks + GO indexBase = rowMap->getIndexBase(); // index base of maps + GO offset = 0; + + // 1) check for blocking/striding information + if (A->IsView("stridedMaps") && + Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { + Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) 
+ RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); + blockdim = strMap->getFixedBlockSize(); + offset = strMap->getOffset(); + oldView = A->SwitchToView(oldView); + GetOStream(Statistics1) << "CoalesceDropFactory::Build():" + << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; + } else + GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; + + // 2) get row map for amalgamated matrix (graph of A) + // with same distribution over all procs as row map of A + RCP nodeMap = amalInfo->getNodeRowMap(); + GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; + + // 3) create graph of amalgamated matrix + RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); + + LO numRows = A->getRowMap()->getLocalNumElements(); + LO numNodes = nodeMap->getLocalNumElements(); + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); + Kokkos::deep_copy(amalgBoundaryNodes, false); + const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node + bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid + + // 4) do amalgamation. 
generate graph of amalgamated matrix + // Note, this code is much more inefficient than the leightwight implementation + // Most of the work has already been done in the AmalgamationFactory + for (LO row = 0; row < numRows; row++) { + // get global DOF id + GO grid = rowMap->getGlobalElement(row); + + // reinitialize boolean helper variable + bIsDiagonalEntry = false; + + // translate grid to nodeid + GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); + + size_t nnz = A->getNumEntriesInLocalRow(row); + Teuchos::ArrayView indices; + Teuchos::ArrayView vals; + A->getLocalRowView(row, indices, vals); + + RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids + LO realnnz = 0; + for (LO col = 0; col < Teuchos::as(nnz); col++) { + GO gcid = colMap->getGlobalElement(indices[col]); // global column id + + if (vals[col] != STS::zero()) { + GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); + cnodeIds->push_back(cnodeId); + realnnz++; // increment number of nnz in matrix row + if (grid == gcid) bIsDiagonalEntry = true; + } + } + + if (realnnz == 1 && bIsDiagonalEntry == true) { + LO lNodeId = nodeMap->getLocalElement(nodeId); + numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId + if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes + amalgBoundaryNodes[lNodeId] = true; + } + + Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); + + if (arr_cnodeIds.size() > 0) + crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); + } + // fill matrix graph + crsGraph->fillComplete(nodeMap, nodeMap); + + // 5) create MueLu Graph object + RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); + + // Detect and record rows that correspond to Dirichlet boundary conditions + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO 
numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + // 6) store results in Level + // graph->SetBoundaryNodeMap(gBoundaryNodeMap); + Set(currentLevel, "DofsPerNode", blockdim); + Set(currentLevel, "Graph", graph); + + } // if (doExperimentalWrap) ... else ... + +} // Build + +template +void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { + FactoryMonitor m(*this, "BuildKokkos", currentLevel); + + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType real_type; + typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; + + if (predrop_ != Teuchos::null) + GetOStream(Parameters0) << predrop_->description(); + + RCP realA = Get>(currentLevel, "A"); + RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); + const ParameterList& pL = GetParameterList(); + bool doExperimentalWrap = pL.get("lightweight wrap"); + + GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; + std::string algo = pL.get("aggregation: drop scheme"); + const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); + + RCP Coords; + RCP A; + + bool use_block_algorithm = false; + LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); + bool useSignedClassicalRS = false; + bool useSignedClassicalSA = false; + bool generateColoringGraph = false; + + // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it + // in the block diagonalization). 
So we'll clobber the rowSumTol with -1.0 in this case + typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); + + RCP ghostedBlockNumber; + ArrayRCP g_block_id; + + if (algo == "distance laplacian") { + // Grab the coordinates for distance laplacian + Coords = Get>(currentLevel, "Coordinates"); + A = realA; + } else if (algo == "signed classical sa") { + useSignedClassicalSA = true; + algo = "classical"; + A = realA; + } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { + useSignedClassicalRS = true; + // if(realA->GetFixedBlockSize() > 1) { + RCP BlockNumber = Get>(currentLevel, "BlockNumber"); + // Ghost the column block numbers if we need to + RCP importer = realA->getCrsGraph()->getImporter(); + if (!importer.is_null()) { + SubFactoryMonitor m1(*this, "Block Number import", currentLevel); + ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); + ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); + } else { + ghostedBlockNumber = BlockNumber; + } + g_block_id = ghostedBlockNumber->getData(0); + // } + if (algo == "block diagonal colored signed classical") + generateColoringGraph = true; + algo = "classical"; + A = realA; + + } else if (algo == "block diagonal") { + // Handle the "block diagonal" filtering and then leave + BlockDiagonalize(currentLevel, realA, false); + return; + } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { + // Handle the "block diagonal" filtering, and then continue onward + use_block_algorithm = true; + RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); + if (algo == "block diagonal distance laplacian") { + // We now need to expand the coordinates by the interleaved blocksize + RCP OldCoords = Get>(currentLevel, "Coordinates"); + if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { + LO dim = 
(LO)OldCoords->getNumVectors(); + Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); + for (LO k = 0; k < dim; k++) { + ArrayRCP old_vec = OldCoords->getData(k); + ArrayRCP new_vec = Coords->getDataNonConst(k); + for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { + LO new_base = i * dim; + for (LO j = 0; j < interleaved_blocksize; j++) + new_vec[new_base + j] = old_vec[i]; + } + } + } else { + Coords = OldCoords; + } + algo = "distance laplacian"; + } else if (algo == "block diagonal classical") { + algo = "classical"; + } + // All cases + A = filteredMatrix; + rowSumTol = -1.0; + } else { + A = realA; + } + + // Distance Laplacian weights + Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); + enum { NO_WEIGHTS = 0, + SINGLE_WEIGHTS, + BLOCK_WEIGHTS }; + int use_dlap_weights = NO_WEIGHTS; + if (algo == "distance laplacian") { + LO dim = (LO)Coords->getNumVectors(); + // If anything isn't 1.0 we need to turn on the weighting + bool non_unity = false; + for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { + if (dlap_weights[i] != 1.0) { + non_unity = true; + } + } + if (non_unity) { + LO blocksize = use_block_algorithm ? 
as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; + if ((LO)dlap_weights.size() == dim) + use_dlap_weights = SINGLE_WEIGHTS; + else if ((LO)dlap_weights.size() == blocksize * dim) + use_dlap_weights = BLOCK_WEIGHTS; + else { + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, + "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); + } + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; + } + } + + // decide wether to use the fast-track code path for standard maps or the somewhat slower + // code path for non-standard maps + /*bool bNonStandardMaps = false; + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) + bNonStandardMaps = true; + }*/ + + if (doExperimentalWrap) { + TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); + TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); + + SC threshold; + // If we're doing the ML-style halving of the drop tol at each level, we do that here. 
+ if (pL.get("aggregation: use ml scaling of drop tol")) + threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); + else + threshold = as(pL.get("aggregation: drop tol")); + + std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); + std::string classicalAlgoStr = pL.get("aggregation: classical algo"); + real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime + + //////////////////////////////////////////////////// + // Remove this bit once we are confident that cut-based dropping works. +#ifdef HAVE_MUELU_DEBUG + int distanceLaplacianCutVerbose = 0; +#endif +#ifdef DJS_READ_ENV_VARIABLES + if (getenv("MUELU_DROP_TOLERANCE_MODE")) { + distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); + } + + if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { + auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); + realThreshold = 1e-4 * tmp; + } + +#ifdef HAVE_MUELU_DEBUG + if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { + distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); + } +#endif +#endif + //////////////////////////////////////////////////// + + enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + scaled_cut_symmetric }; + + decisionAlgoType distanceLaplacianAlgo = defaultAlgo; + decisionAlgoType classicalAlgo = defaultAlgo; + if (algo == "distance laplacian") { + if (distanceLaplacianAlgoStr == "default") + distanceLaplacianAlgo = defaultAlgo; + else if (distanceLaplacianAlgoStr == "unscaled cut") + distanceLaplacianAlgo = unscaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut") + distanceLaplacianAlgo = scaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut symmetric") + distanceLaplacianAlgo = scaled_cut_symmetric; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" 
<< distanceLaplacianAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + } else if (algo == "classical") { + if (classicalAlgoStr == "default") + classicalAlgo = defaultAlgo; + else if (classicalAlgoStr == "unscaled cut") + classicalAlgo = unscaled_cut; + else if (classicalAlgoStr == "scaled cut") + classicalAlgo = scaled_cut; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + + } else + GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); + + // NOTE: We don't support signed classical RS or SA with cut drop at present + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); + + GO numDropped = 0, numTotal = 0; + std::string graphType = "unamalgamated"; // for description purposes only + + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. 
+ BlockSize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = BlockSize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. + */ + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); + const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ if (algo == "classical") { if (predrop_ == null) { @@ -506,7 +2155,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); bool rowIsDirichlet = boundaryNodes[row]; ArrayView indices; @@ -573,11 +2222,11 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } else { - /* Cut Algorithm */ + /* Cut Algorithm */ // CMS using DropTol = Details::DropTol; std::vector drop_vec; - drop_vec.reserve(nnz); + drop_vec.reserve(nnz); const real_type zero = Teuchos::ScalarTraits::zero(); const real_type one = Teuchos::ScalarTraits::one(); LO rownnz = 0; @@ -1594,7 +3243,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... 
-} // Build +} // BuildKokkos template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 105e33e71f5ab985b6559fcc047b4b596336efb1 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 12:19:34 -0600 Subject: [PATCH 042/243] MueLu: Cut Drop Memory Optimization DropTol structure in algorithm replaced with new, smaller DropTolKokkos structure. Computations are now done on the fly. Code passes current unit tests. No significant change in speed. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 130 ++++++++++++------ 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index a8befaea592b..eeb3f91dbfd6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -94,35 +94,48 @@ namespace MueLu { namespace Details { template struct DropTol { - KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{0}; - real_type diag{0}; - LO col{-1}; - //NEW Can't run these host functions on device - //real_type val{Teuchos::ScalarTraits::zero()}; - //real_type diag{Teuchos::ScalarTraits::zero()}; - //LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{Teuchos::ScalarTraits::zero()}; + real_type diag{Teuchos::ScalarTraits::zero()}; + LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for 
debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; + +template +struct DropTolKokkos { + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos() = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos const&) = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos&&) = default; + + DropTolKokkos& operator=(DropTolKokkos const&) = default; + DropTolKokkos& operator=(DropTolKokkos&&) = default; + + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(LO col_, bool drop_) + : col{col_} + , drop{drop_} {} + + LO col{-1}; + LO drop{true}; +}; } // namespace Details template @@ -767,7 +780,7 @@ void CoalesceDropFactory::Build(Level using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTol = Details::DropTol; + using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -779,7 +792,7 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); //stackedTimer->stop("init"); //stackedTimer->start("loop"); @@ -790,74 +803,103 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { LO col = rowView.colidx(colID); if(row == col) { - drop_view(colID) = DropTol(0, 1, colID, false); + drop_view(colID) = DropTolKokkos(colID, true); count++; } //Don't aggregate boundaries else if(!boundaryNodesDevice(colID)) { - typename STS::magnitudeType aiiajj = 
static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 - drop_view(colID) = DropTol(aij, aiiajj, colID, false); + drop_view(colID) = DropTolKokkos(colID, false); count++; } }, n); + + size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.val > b.val; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + return x_aij > y_aij; + } }); //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val; - auto b = y.val; - if(a > realThreshold * b) { + typename STS::magnitudeType x_aij = 0; + typename STS::magnitudeType y_aij = 0; + if(!x.drop) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + } + if(!y.drop) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + } + + if(x_aij > realThreshold * y_aij) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, 
drop_view, [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + return x_aij / x_aiiajj > y_aij / y_aiiajj; + } }); + //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if(a > realThreshold * b) { + typename STS::magnitudeType x_val = 0; + typename STS::magnitudeType y_val = 0; + if(!x.drop) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + x_val = x_aij / x_aiiajj; + } + if(!y.drop) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * 
ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThreshold * y_val) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.col < b.col; + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) { + return a.col < b.col; }); - + LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { From cdee728089dcf993f67cf194dbc51575a4a766e5 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 19:00:22 -0600 Subject: [PATCH 043/243] MueLu: Sorting Now Resembles numpy.argsort Per Christian's request. DropTolKokkos structure removed and replaced with view indices and view of drop flags. ORIGINAL code removed. BuildKokkos removed. Removed commented out timers. Added comments. 
Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 - .../MueLu_CoalesceDropFactory_def.hpp | 1736 +---------------- 2 files changed, 53 insertions(+), 1684 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index db5e9a291313..96b5e778f6bc 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,7 +160,6 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build - void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index eeb3f91dbfd6..da606ab20ff6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,8 +61,8 @@ #include -#include //NEW -#include //NEW +#include +#include #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -115,27 +115,6 @@ struct DropTol { // CMS: Auxillary information for debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; - -template -struct DropTolKokkos { - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos() = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos const&) = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos&&) = default; - - DropTolKokkos& operator=(DropTolKokkos const&) = default; - DropTolKokkos& operator=(DropTolKokkos&&) = default; - - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(LO col_, bool drop_) - : col{col_} - , drop{drop_} {} - - LO col{-1}; - LO drop{true}; -}; } // 
namespace Details template @@ -529,180 +508,6 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; -#define NEW -#ifdef ORIGINAL - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? 
STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for 
(size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row -#endif - -#ifdef NEW if(classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { @@ -772,15 +577,11 @@ void CoalesceDropFactory::Build(Level } } // end for row } - else { //NEW START 
- //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); - //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); - //stackedTimer->start("init"); + else { SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -792,10 +593,9 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - //stackedTimer->stop("init"); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); - //stackedTimer->start("loop"); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); auto rowView = A_device.row(row); @@ -803,45 +603,52 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later if(row == col) { - drop_view(colID) = DropTolKokkos(colID, true); + drop_view(colID) = true; count++; } //Don't aggregate boundaries 
- else if(!boundaryNodesDevice(colID)) { - drop_view(colID) = DropTolKokkos(colID, false); + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; count++; } }, n); size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); return x_aij > y_aij; } }); //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_aij = 0; typename STS::magnitudeType y_aij = 0; - if(!x.drop) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + if(!drop_view(x)) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); } - if(!y.drop) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + if(!drop_view(y)) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * 
rowView.value(y)))); } if(x_aij > realThreshold * y_aij) { @@ -851,34 +658,34 @@ void CoalesceDropFactory::Build(Level } }, Kokkos::Min(dropStart)); } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); return x_aij / x_aiiajj > y_aij / y_aiiajj; } }); - //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), 
[=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_val = 0; typename STS::magnitudeType y_val = 0; - if(!x.drop) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(x)) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); x_val = x_aij / x_aiiajj; } - if(!y.drop) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(y)) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); y_val = y_aij / y_aiiajj; } @@ -890,22 +697,19 @@ void CoalesceDropFactory::Build(Level }, Kokkos::Min(dropStart)); } + //drop everything to the right of where values stop passing threshold if(dropStart < n) { Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; + drop_view(index_view(i)) = true; }); } - Kokkos::Experimental::sort_team(teamMember, drop_view, 
[](DropTolKokkos const& a, DropTolKokkos const& b) { - return a.col < b.col; - }); - LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal - if(row == col || !drop_view(idxID).drop) { + if(row == col || !drop_view(idxID)) { keep++; } else { @@ -913,1459 +717,25 @@ void CoalesceDropFactory::Build(Level drop++; } }, rownnz, rowDropped); + globalnnz += rownnz; totalDropped += rowDropped; rownnzView(row) = rownnz; }, realnnz, numDropped); - //stackedTimer->stop("loop"); - - //stackedTimer->start("remove"); - + + //update column indices so that kept indices are aligned to the left for subview that happens later on auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //stackedTimer->stop("remove"); - - //update row indices - //stackedTimer->start("scan"); + //update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { partial_sum += rownnzView(i); if(is_final) rowsDevice(i+1) = partial_sum; }); Kokkos::deep_copy(rows, rowsDevice); - //stackedTimer->stop("scan"); - - //stackedTimer->stop("timer"); - //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); - } //NEW END -#endif - - numTotal = A->getLocalNumEntries(); - - if (aggregationMayCreateDirichlet) { - // If the only element remaining after filtering is diagonal, mark node as boundary - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - if (rows[row + 1] - rows[row] <= 1) - boundaryNodes[row] = true; - } - } - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), 
A->getColMap(), "thresholded graph of A")); - graph->SetBoundaryNodeMap(boundaryNodes); - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", 1); - - // If we're doing signed classical, we might want to block-diagonalize *after* the dropping - if (generateColoringGraph) { - RCP colorGraph; - RCP importer = A->getCrsGraph()->getImporter(); - BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); - Set(currentLevel, "Coloring Graph", colorGraph); - // #define CMS_DUMP -#ifdef CMS_DUMP - { - Xpetra::IO::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); - Xpetra::IO::Write("m_color_graph." 
+ std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); - // int rank = graph->GetDomainMap()->getComm()->getRank(); - // { - // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // colorGraph->print(*fancy,Debug); - // } - // { - // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // graph->print(*fancy,Debug); - // } - } -#endif - } // end generateColoringGraph - } else if (BlockSize > 1 && threshold == STS::zero()) { - // Case 3: Multiple DOF/node problem without dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. 
The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); - if (rowSumTol > 0.) 
- Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // loop over all local nodes - LO realnnz = 0; - rows(0) = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. 
- bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows(row + 1) = realnnz; - } // for (LO row = 0; row < numRows; row++) - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - 
GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - - } else if (BlockSize > 1 && threshold != STS::zero()) { - // Case 4: Multiple DOF/node problem with dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) 
- Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // extract diagonal data for dropping strategy - RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); - - // loop over all local nodes - LO realnnz = 0; - rows[0] = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. 
- bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns[realnnz++] = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - // columns.resize(realnnz); - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - 
MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - } - - } else if (algo == "distance laplacian") { - LO blkSize = A->GetFixedBlockSize(); - GO indexBase = A->getRowMap()->getIndexBase(); - // [*0*] : FIXME - // ap: somehow, if I move this line to [*1*], Belos throws an error - // I'm not sure what's going on. Do we always have to Get data, if we did - // DeclareInput for it? - // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - if ((blkSize == 1) && (threshold == STS::zero())) { - // Trivial case: scalar problem, no dropping. 
Can return original graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - graph->SetBoundaryNodeMap(pointBoundaryNodes); - graphType = "unamalgamated"; - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) - if (pointBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", blkSize); - Set(currentLevel, "Graph", graph); - - } else { - // ap: We make quite a few assumptions here; general case may be a lot different, - // but much much harder to implement. We assume that: - // 1) all maps are standard maps, not strided maps - // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic - // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i - // - // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, - // but as I totally don't understand that code, here is my solution - - // [*1*]: see [*0*] - - // Check that the number of local coordinates is consistent with the #rows in A - TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, - "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); - - const RCP colMap = A->getColMap(); - RCP uniqueMap, nonUniqueMap; - Array colTranslation; - if (blkSize == 1) { - uniqueMap = A->getRowMap(); - nonUniqueMap = A->getColMap(); - graphType = "unamalgamated"; - - } else { - uniqueMap = Coords->getMap(); - 
TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, - "Different index bases for matrix and coordinates"); - - AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); - - graphType = "amalgamated"; - } - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - RCP ghostedCoords; - RCP ghostedLaplDiag; - Teuchos::ArrayRCP ghostedLaplDiagData; - if (threshold != STS::zero()) { - // Get ghost coordinates - RCP importer; - { - SubFactoryMonitor m1(*this, "Import construction", currentLevel); - if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { - GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; - importer = realA->getCrsGraph()->getImporter(); - } else { - GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; - importer = ImportFactory::Build(uniqueMap, nonUniqueMap); - } - } // subtimer - ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); - { - SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); - ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); - } // subtimer - - // Construct Distance Laplacian diagonal - RCP localLaplDiag = VectorFactory::Build(uniqueMap); - Array indicesExtra; - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - { - SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); - ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - - if (blkSize == 1) { - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - } else { - // Merge rows of A - indicesExtra.resize(0); - MergeRows(*A, row, 
indicesExtra, colTranslation); - indices = indicesExtra; - } - - LO nnz = indices.size(); - bool haveAddedToDiag = false; - for (LO colID = 0; colID < nnz; colID++) { - const LO col = indices[colID]; - - if (row != col) { - if (use_dlap_weights == SINGLE_WEIGHTS) { - /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, - MueLu::Utilities::Distance2(coordData, row, col), - MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - haveAddedToDiag = true; - } - } - // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. - // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. 
- if (!haveAddedToDiag) - localLaplDiagData[row] = STS::rmax(); - } - } // subtimer - { - SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); - ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); - ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); - ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); - } // subtimer - - } else { - GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; - } - - // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - -#ifdef HAVE_MUELU_DEBUG - // DEBUGGING - for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; -#endif - - // Extra array for if we're allowing symmetrization with cutting - ArrayRCP rows_stop; - bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; - if (use_stop_array) - // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); - rows_stop.resize(numRows); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - LO realnnz = 0; - rows(0) = 0; - - Array indicesExtra; - { - SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - - ArrayView vals; // CMS hackery - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - bool isBoundary = false; - - if (blkSize == 1) { - // ArrayView vals;//CMS 
uncomment - A->getLocalRowView(row, indices, vals); - isBoundary = pointBoundaryNodes[row]; - } else { - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - for (LO j = 0; j < blkSize; j++) { - if (!pointBoundaryNodes[row * blkSize + j]) { - isBoundary = false; - break; - } - } - - // Merge rows of A - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - } - numTotal += indices.size(); - - LO nnz = indices.size(), rownnz = 0; - - if (use_stop_array) { - rows(row + 1) = rows(row) + nnz; - realnnz = rows(row); - } - - if (threshold != STS::zero()) { - // default - if (distanceLaplacianAlgo == defaultAlgo) { - /* Standard Distance Laplacian */ - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - columns(realnnz++) = col; - rownnz++; - continue; - } - - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - if (aij > aiiajj) { - columns(realnnz++) = col; - rownnz++; - } else { - numDropped++; - } - } - } else { - /* Cut Algorithm */ - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = 
Teuchos::ScalarTraits::one(); - - // find magnitudes - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - - real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (distanceLaplacianAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; 
- auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } - - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - - // don't drop diagonal - if (row == col) { - columns(realnnz++) = col; - rownnz++; - // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); - continue; - } - - if (!drop_vec[idxID].drop) { - columns(realnnz++) = col; - // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - rownnz++; - } else { - // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - numDropped++; - } - } - } - } else { - // Skip laplace calculation and threshold comparison for zero threshold - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - - if (use_stop_array) - rows_stop[row] = rownnz + rows[row]; - else - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - - } // subtimer - - if (use_stop_array) { - // Do symmetrization of the cut 
matrix - // NOTE: We assume nested row/column maps here - for (LO row = 0; row < numRows; row++) { - for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { - LO col = columns[colidx]; - if (col >= numRows) continue; - - bool found = false; - for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { - if (columns[t_col] == row) - found = true; - } - // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing - // into a Dirichlet unknown. In that case don't. - if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { - LO new_idx = rows_stop[col]; - // printf("(%d,%d) SYMADD entry\n",col,row); - columns[new_idx] = row; - rows_stop[col]++; - numDropped--; - } - } - } - - // Condense everything down - LO current_start = 0; - for (LO row = 0; row < numRows; row++) { - LO old_start = current_start; - for (LO col = rows(row); col < rows_stop[row]; col++) { - if (current_start != col) { - columns(current_start) = columns(col); - } - current_start++; - } - rows[row] = old_start; - } - rows(numRows) = realnnz = current_start; - } - - RCP graph; - { - SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); - graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - } // subtimer - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" - << " using threshold " << dirichletThreshold << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", 
blkSize); - } - } - - if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { - RCP> comm = A->getRowMap()->getComm(); - GO numGlobalTotal, numGlobalDropped; - MueLu_sumAll(comm, numTotal, numGlobalTotal); - MueLu_sumAll(comm, numDropped, numGlobalDropped); - GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; - if (numGlobalTotal != 0) - GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; - GetOStream(Statistics1) << std::endl; - } - - } else { - // what Tobias has implemented - - SC threshold = as(pL.get("aggregation: drop tol")); - // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - GetOStream(Runtime0) << "algorithm = \"" - << "failsafe" - << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - RCP rowMap = A->getRowMap(); - RCP colMap = A->getColMap(); - - LO blockdim = 1; // block dim for fixed size blocks - GO indexBase = rowMap->getIndexBase(); // index base of maps - GO offset = 0; - - // 1) check for blocking/striding information - if (A->IsView("stridedMaps") && - Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { - Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) 
- RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); - blockdim = strMap->getFixedBlockSize(); - offset = strMap->getOffset(); - oldView = A->SwitchToView(oldView); - GetOStream(Statistics1) << "CoalesceDropFactory::Build():" - << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; - } else - GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; - - // 2) get row map for amalgamated matrix (graph of A) - // with same distribution over all procs as row map of A - RCP nodeMap = amalInfo->getNodeRowMap(); - GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; - - // 3) create graph of amalgamated matrix - RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); - - LO numRows = A->getRowMap()->getLocalNumElements(); - LO numNodes = nodeMap->getLocalNumElements(); - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); - Kokkos::deep_copy(amalgBoundaryNodes, false); - const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node - bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid - - // 4) do amalgamation. 
generate graph of amalgamated matrix - // Note, this code is much more inefficient than the leightwight implementation - // Most of the work has already been done in the AmalgamationFactory - for (LO row = 0; row < numRows; row++) { - // get global DOF id - GO grid = rowMap->getGlobalElement(row); - - // reinitialize boolean helper variable - bIsDiagonalEntry = false; - - // translate grid to nodeid - GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); - - size_t nnz = A->getNumEntriesInLocalRow(row); - Teuchos::ArrayView indices; - Teuchos::ArrayView vals; - A->getLocalRowView(row, indices, vals); - - RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids - LO realnnz = 0; - for (LO col = 0; col < Teuchos::as(nnz); col++) { - GO gcid = colMap->getGlobalElement(indices[col]); // global column id - - if (vals[col] != STS::zero()) { - GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); - cnodeIds->push_back(cnodeId); - realnnz++; // increment number of nnz in matrix row - if (grid == gcid) bIsDiagonalEntry = true; - } - } - - if (realnnz == 1 && bIsDiagonalEntry == true) { - LO lNodeId = nodeMap->getLocalElement(nodeId); - numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId - if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes - amalgBoundaryNodes[lNodeId] = true; - } - - Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); - - if (arr_cnodeIds.size() > 0) - crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); - } - // fill matrix graph - crsGraph->fillComplete(nodeMap, nodeMap); - - // 5) create MueLu Graph object - RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); - - // Detect and record rows that correspond to Dirichlet boundary conditions - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO 
numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - // 6) store results in Level - // graph->SetBoundaryNodeMap(gBoundaryNodeMap); - Set(currentLevel, "DofsPerNode", blockdim); - Set(currentLevel, "Graph", graph); - - } // if (doExperimentalWrap) ... else ... - -} // Build - -template -void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { - FactoryMonitor m(*this, "BuildKokkos", currentLevel); - - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType real_type; - typedef Xpetra::MultiVector RealValuedMultiVector; - typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; - - if (predrop_ != Teuchos::null) - GetOStream(Parameters0) << predrop_->description(); - - RCP realA = Get>(currentLevel, "A"); - RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); - const ParameterList& pL = GetParameterList(); - bool doExperimentalWrap = pL.get("lightweight wrap"); - - GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; - std::string algo = pL.get("aggregation: drop scheme"); - const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); - - RCP Coords; - RCP A; - - bool use_block_algorithm = false; - LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); - bool useSignedClassicalRS = false; - bool useSignedClassicalSA = false; - bool generateColoringGraph = false; - - // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it - // in the block diagonalization). 
So we'll clobber the rowSumTol with -1.0 in this case - typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); - - RCP ghostedBlockNumber; - ArrayRCP g_block_id; - - if (algo == "distance laplacian") { - // Grab the coordinates for distance laplacian - Coords = Get>(currentLevel, "Coordinates"); - A = realA; - } else if (algo == "signed classical sa") { - useSignedClassicalSA = true; - algo = "classical"; - A = realA; - } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { - useSignedClassicalRS = true; - // if(realA->GetFixedBlockSize() > 1) { - RCP BlockNumber = Get>(currentLevel, "BlockNumber"); - // Ghost the column block numbers if we need to - RCP importer = realA->getCrsGraph()->getImporter(); - if (!importer.is_null()) { - SubFactoryMonitor m1(*this, "Block Number import", currentLevel); - ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); - ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); - } else { - ghostedBlockNumber = BlockNumber; - } - g_block_id = ghostedBlockNumber->getData(0); - // } - if (algo == "block diagonal colored signed classical") - generateColoringGraph = true; - algo = "classical"; - A = realA; - - } else if (algo == "block diagonal") { - // Handle the "block diagonal" filtering and then leave - BlockDiagonalize(currentLevel, realA, false); - return; - } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { - // Handle the "block diagonal" filtering, and then continue onward - use_block_algorithm = true; - RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); - if (algo == "block diagonal distance laplacian") { - // We now need to expand the coordinates by the interleaved blocksize - RCP OldCoords = Get>(currentLevel, "Coordinates"); - if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { - LO dim = 
(LO)OldCoords->getNumVectors(); - Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); - for (LO k = 0; k < dim; k++) { - ArrayRCP old_vec = OldCoords->getData(k); - ArrayRCP new_vec = Coords->getDataNonConst(k); - for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { - LO new_base = i * dim; - for (LO j = 0; j < interleaved_blocksize; j++) - new_vec[new_base + j] = old_vec[i]; - } - } - } else { - Coords = OldCoords; - } - algo = "distance laplacian"; - } else if (algo == "block diagonal classical") { - algo = "classical"; - } - // All cases - A = filteredMatrix; - rowSumTol = -1.0; - } else { - A = realA; - } - - // Distance Laplacian weights - Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); - enum { NO_WEIGHTS = 0, - SINGLE_WEIGHTS, - BLOCK_WEIGHTS }; - int use_dlap_weights = NO_WEIGHTS; - if (algo == "distance laplacian") { - LO dim = (LO)Coords->getNumVectors(); - // If anything isn't 1.0 we need to turn on the weighting - bool non_unity = false; - for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { - if (dlap_weights[i] != 1.0) { - non_unity = true; - } - } - if (non_unity) { - LO blocksize = use_block_algorithm ? 
as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; - if ((LO)dlap_weights.size() == dim) - use_dlap_weights = SINGLE_WEIGHTS; - else if ((LO)dlap_weights.size() == blocksize * dim) - use_dlap_weights = BLOCK_WEIGHTS; - else { - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, - "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); - } - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; - } - } - - // decide wether to use the fast-track code path for standard maps or the somewhat slower - // code path for non-standard maps - /*bool bNonStandardMaps = false; - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) - bNonStandardMaps = true; - }*/ - - if (doExperimentalWrap) { - TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); - TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); - - SC threshold; - // If we're doing the ML-style halving of the drop tol at each level, we do that here. 
- if (pL.get("aggregation: use ml scaling of drop tol")) - threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); - else - threshold = as(pL.get("aggregation: drop tol")); - - std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); - std::string classicalAlgoStr = pL.get("aggregation: classical algo"); - real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime - - //////////////////////////////////////////////////// - // Remove this bit once we are confident that cut-based dropping works. -#ifdef HAVE_MUELU_DEBUG - int distanceLaplacianCutVerbose = 0; -#endif -#ifdef DJS_READ_ENV_VARIABLES - if (getenv("MUELU_DROP_TOLERANCE_MODE")) { - distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); - } - - if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { - auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); - realThreshold = 1e-4 * tmp; - } - -#ifdef HAVE_MUELU_DEBUG - if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { - distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); - } -#endif -#endif - //////////////////////////////////////////////////// - - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; - decisionAlgoType classicalAlgo = defaultAlgo; - if (algo == "distance laplacian") { - if (distanceLaplacianAlgoStr == "default") - distanceLaplacianAlgo = defaultAlgo; - else if (distanceLaplacianAlgoStr == "unscaled cut") - distanceLaplacianAlgo = unscaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut") - distanceLaplacianAlgo = scaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut symmetric") - distanceLaplacianAlgo = scaled_cut_symmetric; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" 
<< distanceLaplacianAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - } else if (algo == "classical") { - if (classicalAlgoStr == "default") - classicalAlgo = defaultAlgo; - else if (classicalAlgoStr == "unscaled cut") - classicalAlgo = unscaled_cut; - else if (classicalAlgoStr == "scaled cut") - classicalAlgo = scaled_cut; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - - } else - GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); - - // NOTE: We don't support signed classical RS or SA with cut drop at present - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); - - GO numDropped = 0, numTotal = 0; - std::string graphType = "unamalgamated"; // for description purposes only - - /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. 
- BlockSize is the number of storage blocks that must kept together during the amalgamation process. - - Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: - - numPDEs = BlockSize * storageblocksize. - - If numPDEs==1 - Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 - No other values makes sense. - - If numPDEs>1 - If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. - If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. - Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. - */ - TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); - const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); - - /************************** RS or SA-style Classical Dropping (and variants) **************************/ - if (algo == "classical") { - if (predrop_ == null) { - // ap: this is a hack: had to declare predrop_ as mutable - predrop_ = rcp(new PreDropFunctionConstVal(threshold)); - } - - if (predrop_ != null) { - RCP predropConstVal = rcp_dynamic_cast(predrop_); - TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, - "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); - // If a user provided a predrop function, it overwrites the XML threshold parameter - SC newt = predropConstVal->GetThreshold(); - if (newt != threshold) { - GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; - threshold = newt; - } - } - // At this points we either have - // (predrop_ != null) - // Therefore, it is sufficient to check only threshold - if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && 
!useSignedClassicalSA && A->hasCrsGraph()) { - // Case 1: scalar problem, no dropping => just use matrix graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - // Detect and record rows that correspond to Dirichlet boundary conditions - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - - graph->SetBoundaryNodeMap(boundaryNodes); - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes[i]) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", 1); - Set(currentLevel, "Graph", graph); - - } else if ((BlockSize == 1 && threshold != STS::zero()) || - (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || - (BlockSize == 1 && useSignedClassicalRS) || - (BlockSize == 1 && useSignedClassicalSA)) { - // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original - // graph's map information, e.g., whether index is local - // OR a matrix without a CrsGraph - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - using MT = typename STS::magnitudeType; - RCP ghostedDiag; - ArrayRCP ghostedDiagVals; - ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal - if (useSignedClassicalRS) { - if (ghostedBlockNumber.is_null()) { - negMaxOffDiagonal = 
MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; - } else { - negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; - } - } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { - if (ghostedBlockNumber.is_null()) { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - } else { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); - } - } - - LO realnnz = 0; - rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) 
= %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be 
fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } 
- std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row + } numTotal = A->getLocalNumEntries(); @@ -3285,7 +1655,7 @@ void CoalesceDropFactory::BuildKokkos } // if (doExperimentalWrap) ... else ... -} // BuildKokkos +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 8c8c83ba2966e1bbfb51cbcd8fe82dca744b4d65 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Thu, 1 Aug 2024 11:00:51 -0600 Subject: [PATCH 044/243] Python: Avoid install errors with older pybind11 smart_holder SHA --- packages/rol/pyrol/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/pyrol/pyproject.toml b/packages/rol/pyrol/pyproject.toml index 17c36d0221a2..0138249c8c0e 100644 --- a/packages/rol/pyrol/pyproject.toml +++ b/packages/rol/pyrol/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["scikit-build-core>=0.3.3","pybind11 @ git+https://github.com/pybind/pybind11.git@smart_holder"] +requires = ["scikit-build-core>=0.3.3","pybind11 @ git+https://github.com/pybind/pybind11.git@c6c9a9e59b2b64393de0432aa6867ed27367912a"] build-backend = "scikit_build_core.build" From 7c78025bf66884fd8b788bac2ae352e871241e7d Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Wed, 24 Jul 2024 17:51:23 -0600 Subject: [PATCH 045/243] MueLu: std::complex Replaced With Kokkos::complex Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 475 +++++++++--------- 1 file changed, 241 insertions(+), 234 deletions(-) diff --git 
a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index da606ab20ff6..ad5895e2e41b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -475,10 +475,10 @@ void CoalesceDropFactory::Build(Level typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); using MT = typename STS::magnitudeType; - RCP ghostedDiag; + RCP ghostedDiag; ArrayRCP ghostedDiagVals; ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal + // RS style needs the max negative off-diagonal, SA style needs the diagonal if (useSignedClassicalRS) { if (ghostedBlockNumber.is_null()) { negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); @@ -491,10 +491,12 @@ void CoalesceDropFactory::Build(Level } } else { ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } + if(classicalAlgo == defaultAlgo) { + ghostedDiagVals = ghostedDiag->getData(0); + } + } auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { + if (rowSumTol > 0.) { if (ghostedBlockNumber.is_null()) { if (GetVerbLevel() & Statistics1) GetOStream(Statistics1) << "Applying point row sum criterion." 
<< std::endl; @@ -508,234 +510,239 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { - SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? 
STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } // end for row - } - else { - SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - - //move from host to device - ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); - Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); - - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); - - int algorithm = classicalAlgo; - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - auto index_views = Kokkos::View("index_views", A_device.nnz()); - - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto 
rowView = A_device.row(row); - size_t nnz = rowView.length; - - size_t n = 0; - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } - //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - count++; - } - }, n); - - size_t dropStart = n; - if (algorithm == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - return x_aij > y_aij; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_aij = 0; - typename STS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - } - if(!drop_view(y)) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - } - - if(x_aij > realThreshold * y_aij) { - if(i < min) { - min = i; - } - } - }, 
Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - return x_aij / x_aiiajj > y_aij / y_aiiajj; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_val = 0; - typename STS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - y_val = y_aij / y_aiiajj; - } - - if(x_val > realThreshold * y_val) { - if(i < min) { - min = i; - } - } - }, 
Kokkos::Min(dropStart)); - } - - //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } - - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - keep++; - } - else { - rowView.colidx(idxID) = -1; - drop++; - } - }, rownnz, rowDropped); - - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); - - //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); - Kokkos::deep_copy(columns, columnsDevice); - - //update row indices by adding up new # of nnz in each row - auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); - Kokkos::deep_copy(rows, rowsDevice); - } + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the 
dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? 
STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { + /* Cut Algorithm */ + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; + using impl_scalar_type = typename ATS::val_type; + using implATS = Kokkos::ArithTraits; + + //move from host to device + auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto thresholdKokkos = static_cast(threshold); + auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); + + 
Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later + if(row == col) { + drop_view(colID) = true; + count++; + } + //Don't aggregate boundaries + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; + count++; + } + }, n); + + size_t dropStart = n; + if (algorithm == unscaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if(!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + } + if(!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * 
rowView.value(y)); + } + + if(x_aij > realThresholdKokkos * y_aij) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } else if (algorithm == scaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if(!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if(!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThresholdKokkos * y_val) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } + + 
//drop everything to the right of where values stop passing threshold + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID)) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + + //update column indices so that kept indices are aligned to the left for subview that happens later on + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); + + //update row indices by adding up new # of nnz in each row + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + } numTotal = A->getLocalNumEntries(); @@ -1655,7 +1662,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... -} // Build +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 5290c24cb28adee2a184bd525b068bddf95d3830 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 7 Aug 2024 17:23:05 -0600 Subject: [PATCH 046/243] find_instances.py now finds and constructs parameter hierarchy for nearly all parameters in rol/src. 
Signed-off-by: Greg von Winckel --- .../rol_parameters/all_rol_parameters.json | 499 ++++++++++++++++++ packages/rol/rol_parameters/find_instances.py | 203 +++++++ .../rol/rol_parameters/list_of_rol_files.txt | 148 ++++++ 3 files changed, 850 insertions(+) create mode 100644 packages/rol/rol_parameters/all_rol_parameters.json create mode 100644 packages/rol/rol_parameters/find_instances.py create mode 100644 packages/rol/rol_parameters/list_of_rol_files.txt diff --git a/packages/rol/rol_parameters/all_rol_parameters.json b/packages/rol/rol_parameters/all_rol_parameters.json new file mode 100644 index 000000000000..124e41cd1d51 --- /dev/null +++ b/packages/rol/rol_parameters/all_rol_parameters.json @@ -0,0 +1,499 @@ +{"Absolute Value Approximation": {}, + "Adaptive Rank": {}, + "Additive Rank Update": {}, + "Adjoint Domain Seed": {}, + "Adjoint Range Seed": {}, + "Adjoint Rank": {}, + "Dimension": {}, + "Distribution": {"Name": {}}, + "Dynamic Constraint": {"Solve": {"Absolute Residual Tolerance": {}, + "Backtracking Factor": {}, + "Iteration Limit": {}, + "Output Iteration History": {}, + "Relative Residual Tolerance": {}, + "Solver Type": {}, + "Step Tolerance": {}, + "Sufficient Decrease Tolerance": {}, + "Zero Initial Guess": {}}}, + "General": {"Inexact Gradient": {}, + "Inexact Hessian-Times-A-Vector": {}, + "Inexact Objective Function": {}, + "Krylov": {"Absolute Tolerance": {}, + "Iteration Limit": {}, + "Relative Tolerance": {}, + "Type": {}, + "User Defined Krylov Name": {}}, + "Output Level": {}, + "Polyhedral Projection": {"Absolute Tolerance": {}, + "Douglas-Rachford": {"Constraint Weight": {}, + "Penalty Parameter": {}, + "Relaxation Parameter": {}}, + "Iteration Limit": {}, + "Multiplier Tolerance": {}, + "Relative Tolerance": {}, + "Semismooth Newton": {"Backtracking Rate": {}, + "Krylov": {}, + "Line Search Type": {}, + "Project onto Separating Hyperplane": {}, + "Regularization Scale": {}, + "Relative Error Scale": {}, + "Step Tolerance": {}, + 
"Sufficient Decrease Tolerance": {}}, + "Type": {}}, + "Secant": {"Type": {}, + "Use as Hessian": {}, + "Use as Preconditioner": {}, + "User Defined Secant Name": {}}}, + "Log Rank Update Shift": {}, + "Log Rank Update Slope": {}, + "Lower Bound": {}, + "Maximum Rank": {}, + "Maximum Tolerance": {}, + "Mean": {}, + "Number of Quadrature Points": {}, + "Number of Samples": {}, + "OED": {"A-Optimality": {"Number of Samples": {}, + "Randomized Trace Estimation": {}}, + "C-Optimality": {"C Value": {}}, + "Constraint Scaling": {}, + "Double-Well Penalty Parameter": {}, + "I-Optimality": {"Number of Samples": {}, + "Randomized Trace Estimation": {}, + "Use Trace Form": {}}, + "L1 Penalty Parameter": {}, + "Objective Scaling": {}, + "Optimality Type": {}, + "R-Optimality": {"Confidence Level": {}, + "Convex Combination Parameter": {}, + "Smoothing Parameter": {}, + "Use Primal-Dual Algorithm": {}}, + "Use Double-Well Penalty": {}, + "Use L1 Penalty": {}, + "Use Scaling": {}, + "Use Storage": {}}, + "Orthogonality Tolerance": {}, + "Output Frequency": {}, + "Plus Function": {"Smoothing Parameter": {}}, + "Points File Name": {}, + "Print Optimization Vector": {}, + "Print Quadrature to Screen": {}, + "Rank Update Factor": {}, + "Reorthogonalization Iterations": {}, + "SOL": {"Deviation Measure": {"CVaR": {}, + "Entropic": {}, + "Generalized Moreau-Yosida CVaR": {}, + "Log Quantile": {}, + "Moreau-Yosida CVaR": {}, + "Name": {}, + "Smoothed Upper Range": {}, + "Truncated Mean": {}}, + "Distribution": {"Arcsine": {"Lower Bound": {}, "Upper Bound": {}}, + "Beta": {"Shape 1": {}, "Shape 2": {}}, + "Cauchy": {"Location": {}, "Scale": {}}, + "Dirac": {"Location": {}}, + "Exponential": {"Location": {}, "Scale": {}}, + "Gamma": {"Scale": {}, "Shape": {}}, + "Gaussian": {"Mean": {}, "Variance": {}}, + "Gumbel": {"Location": {}, "Scale": {}}, + "Kumaraswamy": {"Exponent 1": {}, + "Exponent 2": {}, + "Lower Bound": {}, + "Upper Bound": {}}, + "Laplace": {"Mean": {}, "Scale": {}}, + 
"Logistic": {"Mean": {}, "Scale": {}}, + "Name": {}, + "Parabolic": {"Lower Bound": {}, "Upper Bound": {}}, + "Raised Cosine": {"Mean": {}, "Scale": {}}, + "Smale": {"Lower Bound": {}, "Upper Bound": {}}, + "Triangle": {"Lower Bound": {}, + "Peak Location": {}, + "Upper Bound": {}}, + "Truncated Exponential": {}, + "Truncated Gaussian": {}, + "Uniform": {"Lower Bound": {}, "Upper Bound": {}}}, + "Error Measure": {"Exponential": {}, + "Generalized Moreau-Yosida-Koenker-Bassett": {}, + "Huber": {}, + "Koenker-Bassett": {}, + "Log Quantile": {}, + "Moreau-Yosida-Koenker-Bassett": {}, + "Name": {}, + "Smoothed Worst Case": {}}, + "Initial Statistic": {}, + "Objective": {"Risk Measure": {"CVaR": {"Confidence Level": {}, + "Convex Combination Parameter": {}}, + "Confidence Level": {}, + "Convex Combination Parameter": {}, + "Name": {}, + "Smoothing Parameter": {}}, + "Risk Neutral": {"Use Storage": {}}, + "Store Sampled Value and Gradient": {}, + "Type": {}}, + "Primal Dual Risk": {"Dual Tolerance": {}, + "Dual Tolerance Decrease Exponent": {}, + "Dual Tolerance Update Exponent": {}, + "Dual Tolerance Update Scale": {}, + "Initial Constraint Tolerance": {}, + "Initial Dual Tolerance": {}, + "Initial Gradient Tolerance": {}, + "Initial Penalty Parameter": {}, + "Iteration Limit": {}, + "Maximum Penalty Parameter": {}, + "Penalty Update Scale": {}, + "Print Subproblem Solve History": {}, + "Solver Tolerance Decrease Scale": {}, + "Solver Tolerance Update Scale": {}, + "Update Frequency": {}}, + "Probability": {"Name": {}, "bPOE": {"Threshold": {}}}, + "Progressive Hedging": {"Dynamic Tolerance": {}, + "Fixed Tolerance": {}, + "Initial Penalty Parameter": {}, + "Iteration Limit": {}, + "Maximum Penalty Parameter": {}, + "Nonanticipativity Constraint Tolerance": {}, + "Penalty Update Frequency": {}, + "Penalty Update Scale": {}, + "Print Subproblem Solve History": {}, + "Use Inexact Solve": {}, + "Use Presolve": {}}, + "Regret Measure": {"Exponential": {}, + "Generalized 
Moreau-Yosida Mean Absolute Loss": {}, + "Log Quantile": {}, + "Mean Absolute Loss": {}, + "Mean L2": {}, + "Moreau-Yosida Mean Absolute Loss": {}, + "Name": {}, + "Smoothed Worst Case": {}, + "Truncated Mean": {}}, + "Risk Measure": {"CVaR": {"Confidence Level": {}, + "Convex Combination Parameter": {}}, + "Chebyshev Spectral Risk": {}, + "Convex Combination Risk Measure": {}, + "Entropic Risk": {}, + "F-Divergence": {}, + "Generalized Moreau-Yosida CVaR": {}, + "HMCR": {"Confidence Level": {}, + "Convex Combination Parameter": {}}, + "KL Divergence": {}, + "Log Quantile": {}, + "Mean Plus Deviation": {}, + "Mean Plus Deviation From Target": {}, + "Mean Plus Semi-Deviation": {"Coefficient": {}}, + "Mean Plus Semi-Deviation From Target": {"Coefficient": {}, + "Target": {}}, + "Mean Plus Variance": {}, + "Mean Plus Variance From Target": {}, + "Mixed CVaR": {}, + "Moreau-Yosida CVaR": {}, + "Name": {}, + "Quantile Radius": {}, + "Safety Margin": {}, + "Second Order CVaR": {}, + "Smoothed Worst Case": {}, + "Spectral Risk": {}, + "Truncated Mean": {}}, + "Sample Generator": {"SROM": {"Adaptive Sampling": {}, + "Atom Tolerance": {}, + "CDF Smoothing Parameter": {}, + "Number of New Samples Per Adaptation": {}, + "Number of Samples": {}, + "Presolve for Atom Locations": {}, + "Probability Tolerance": {}}, + "User Input": {}}, + "Store Sampled Value and Gradient": {}, + "Type": {}}, + "Scalar Minimization": {"Bisection": {"Iteration Limit": {}, "Tolerance": {}}, + "Brent's": {"Iteration Limit": {}, "Tolerance": {}}, + "Golden Section": {"Iteration Limit": {}, + "Tolerance": {}}, + "Iteration Limit": {}, + "Tolerance": {}, + "Type": {}}, + "Scale": {}, + "SimOpt": {"Solve": {"Absolute Residual Tolerance": {}, + "Backtracking Factor": {}, + "Iteration Limit": {}, + "Output Iteration History": {}, + "Relative Residual Tolerance": {}, + "Solver Type": {}, + "Step Tolerance": {}, + "Sufficient Decrease Tolerance": {}, + "Zero Initial Guess": {}}}, + "Smoothing Parameter": 
{}, + "Standard Deviation": {}, + "State Domain Seed": {}, + "State Range Seed": {}, + "State Rank": {}, + "State Sensitivity Domain Seed": {}, + "State Sensitivity Range Seed": {}, + "State Sensitivity Rank": {}, + "Status Test": {"Constraint Tolerance": {}, + "Gradient Scale": {}, + "Gradient Tolerance": {}, + "Iteration Limit": {}, + "Proximal Gradient Parameter": {}, + "Step Tolerance": {}, + "Use Relative Tolerances": {}}, + "Step": {"Augmented Lagrangian": {"Constraint Scaling": {}, + "Feasibility Tolerance Decrease Exponent": {}, + "Feasibility Tolerance Update Exponent": {}, + "Initial Feasibility Tolerance": {}, + "Initial Optimality Tolerance": {}, + "Initial Penalty Parameter": {}, + "Level of Hessian Approximation": {}, + "Maximum Penalty Parameter": {}, + "Objective Scaling": {}, + "Optimality Tolerance Decrease Exponent": {}, + "Optimality Tolerance Update Exponent": {}, + "Penalty Parameter Growth Factor": {}, + "Penalty Parameter Reciprocal Lower Bound": {}, + "Print Intermediate Optimization History": {}, + "Subproblem Iteration Limit": {}, + "Use Default Initial Penalty Parameter": {}, + "Use Default Problem Scaling": {}, + "Use Scaled Augmented Lagrangian": {}}, + "Bundle": {"Cutting Plane Iteration Limit": {}, + "Cutting Plane Tolerance": {}, + "Distance Measure Coefficient": {}, + "Epsilon Solution Tolerance": {}, + "Initial Trust-Region Parameter": {}, + "Locality Measure Coefficient": {}, + "Lower Threshold for Serious Step": {}, + "Maximum Bundle Size": {}, + "Removal Size for Bundle Update": {}, + "Upper Threshold for Null Step": {}, + "Upper Threshold for Serious Step": {}}, + "Composite Step": {"Initial Radius": {}, + "Optimality System Solver": {"Fix Tolerance": {}, + "Iteration Limit": {}, + "Nominal Relative Tolerance": {}}, + "Tangential Subproblem Solver": {"Iteration Limit": {}, + "Relative Tolerance": {}}, + "Use Constraint Hessian": {}}, + "Fletcher": {"Inexact Solves": {}, + "Level of Hessian Approximation": {}, + "Maximum 
Penalty Parameter": {}, + "Minimum Penalty Parameter": {}, + "Minimum Regularization Parameter": {}, + "Modify Penalty Parameter": {}, + "Penalty Parameter": {}, + "Penalty Parameter Growth Factor": {}, + "Quadratic Penalty Parameter": {}, + "Regularization Parameter": {}, + "Regularization Parameter Decrease Factor": {}, + "Subproblem Iteration Limit": {}}, + "Interior Point": {"Barrier Penalty Reduction Factor": {}, + "Initial Barrier Parameter": {}, + "Linear Damping Coefficient": {}, + "Maximum Barrier Parameter": {}, + "Minimum Barrier Parameter": {}, + "Subproblem": {"Feasibility Tolerance Reduction Factor": {}, + "Initial Feasibility Tolerance": {}, + "Initial Optimality Tolerance": {}, + "Iteration Limit": {}, + "Optimality Tolerance Reduction Factor": {}, + "Print History": {}, + "Step Type": {}}, + "Use Linear Damping": {}}, + "Line Search": {"Accept Last Alpha": {}, + "Accept Linesearch Minimizer": {}, + "Apply Prox to Initial Guess": {}, + "Curvature Condition": {"General Parameter": {}, + "Generalized Wolfe Parameter": {}, + "Type": {}}, + "Descent Method": {"Nonlinear CG Type": {}, + "Type": {}, + "User Defined Descent Direction Name": {}, + "User Defined Nonlinear CG Name": {}}, + "Finite Difference Directional Derivative": {}, + "Function Evaluation Limit": {}, + "Inexact Newton": {"Lower Step Size Safeguard": {}, + "Subproblem Absolute Tolerance": {}, + "Subproblem Iteration Limit": {}, + "Subproblem Relative Tolerance": {}, + "Subproblem Solver": {}, + "Subproblem Tolerance Exponent": {}, + "Upper Step Size Safeguard": {}}, + "Initial Step Size": {}, + "Line-Search Method": {"Backtracking Rate": {}, + "Increase Rate": {}, + "Iteration Limit": {}, + "Path-Based Target Level": {"Target Relaxation Parameter": {}, + "Upper Bound on Path Length": {}}, + "Tolerance": {}, + "Type": {}, + "User Defined Line Search Name": {}}, + "Lower Bound for Initial Step Size": {}, + "Maximum Number of Function Evaluations": {}, + "Maximum Step Size": {}, + "Normalize 
Initial Step Size": {}, + "PQN": {"Lower Step Size Safeguard": {}, + "Subproblem Absolute Tolerance": {}, + "Subproblem Iteration Limit": {}, + "Subproblem Relative Tolerance": {}, + "Subproblem Solver": {}, + "Upper Step Size Safeguard": {}}, + "Quasi-Newton": {"L-Secant-B": {"Cauchy Point": {"Decrease Tolerance": {}, + "Expansion Rate": {}, + "Initial Step Size": {}, + "Maximum Number of Expansion Steps": {}, + "Maximum Number of Reduction Steps": {}, + "Normalize Initial Step Size": {}, + "Reduction Rate": {}}, + "Relative Tolerance Exponent": {}, + "Sufficient Decrease Parameter": {}}, + "Method": {}}, + "Status Test": {"Gradient Tolerance": {}}, + "Sufficient Decrease Tolerance": {}, + "Use Adaptive Step Size Selection": {}, + "Use Previous Step Length as Initial Guess": {}, + "User Defined Initial Step Size": {}}, + "Moreau-Yosida Penalty": {"Initial Penalty Parameter": {}, + "Maximum Penalty Parameter": {}, + "Penalty Parameter Growth Factor": {}, + "Subproblem": {"Feasibility Tolerance": {}, + "Iteration Limit": {}, + "Optimality Tolerance": {}, + "Print History": {}, + "Step Type": {}, + "Use Relative Tolerances": {}}, + "Update Multiplier": {}, + "Update Penalty": {}}, + "Primal Dual Active Set": {"Dual Scaling": {}, + "Iteration Limit": {}, + "Relative Gradient Tolerance": {}, + "Relative Step Tolerance": {}}, + "Primal Dual Interior Point": {"Barrier Objective": {"Initial Barrier Parameter": {}, + "Linear Damping Coefficient": {}, + "Use Linear Damping": {}}}, + "Spectral Gradient": {"Apply Prox to Initial Guess": {}, + "Backtracking Rate": {}, + "Function Evaluation Limit": {}, + "Initial Spectral Step Size": {}, + "Lower Step Size Safeguard": {}, + "Maximum Spectral Step Size": {}, + "Maximum Storage Size": {}, + "Minimum Spectral Step Size": {}, + "Sufficient Decrease Tolerance": {}, + "Upper Step Size Safeguard": {}}, + "Stabilized LCL": {"Constraint Scaling": {}, + "Elastic Penalty Parameter Growth Rate": {}, + "Feasibility Tolerance Decrease 
Exponent": {}, + "Feasibility Tolerance Increase Exponent": {}, + "Initial Elastic Penalty Parameter": {}, + "Initial Feasibility Tolerance": {}, + "Initial Optimality Tolerance": {}, + "Initial Penalty Parameter": {}, + "Level of Hessian Approximation": {}, + "Maximum Elastic Penalty Parameter": {}, + "Maximum Penalty Parameter": {}, + "Objective Scaling": {}, + "Optimality Tolerance Decrease Exponent": {}, + "Optimality Tolerance Increase Exponent": {}, + "Penalty Parameter Growth Factor": {}, + "Subproblem Iteration Limit": {}, + "Use Default Initial Penalty Parameter": {}, + "Use Default Problem Scaling": {}, + "Use Scaled Stabilized LCL": {}}, + "Trust Region": {"Apply Prox to Initial Guess": {}, + "Coleman-Li": {"Relative Tolerance Exponent": {}, + "Relaxation Safeguard": {}, + "Sufficient Decrease Parameter": {}}, + "General": {"Output Level": {}}, + "Inexact": {"Gradient": {"Relative Tolerance": {}, + "Tolerance Scaling": {}}, + "Value": {"Exponent": {}, + "Forcing Sequence Initial Value": {}, + "Forcing Sequence Reduction Factor": {}, + "Forcing Sequence Update Frequency": {}, + "Tolerance Scaling": {}}}, + "Initial Radius": {}, + "Kelley-Sachs": {"Binding Set Tolerance": {}, + "Initial Post-Smoothing Step Size": {}, + "Maximum Number of Smoothing Iterations": {}, + "Post-Smoothing Backtracking Rate": {}, + "Post-Smoothing Decrease Parameter": {}, + "Sufficient Decrease Parameter": {}}, + "Lin-More": {"Cauchy Point": {"Decrease Tolerance": {}, + "Expansion Rate": {}, + "Initial Step Size": {}, + "Maximum Number of Expansion Steps": {}, + "Maximum Number of Reduction Steps": {}, + "Normalize Initial Step Size": {}, + "Reduction Rate": {}}, + "Maximum Number of Minor Iterations": {}, + "Projected Search": {"Backtracking Rate": {}, + "Maximum Number of Steps": {}}, + "Relative Tolerance Exponent": {}, + "Sufficient Decrease Parameter": {}}, + "Maximum Radius": {}, + "Nonmonotone Storage Limit": {}, + "Nonmonotone Storage Size": {}, + "Radius Growing Rate": 
{}, + "Radius Growing Threshold": {}, + "Radius Shrinking Threshold": {}, + "SPG": {"Cauchy Point": {"Decrease Tolerance": {}, + "Expansion Rate": {}, + "Initial Step Size": {}, + "Maximum Number of Expansion Steps": {}, + "Maximum Number of Reduction Steps": {}, + "Normalize Initial Step Size": {}, + "Reduction Rate": {}}, + "Relative Tolerance Exponent": {}, + "Solver": {"Absolute Tolerance": {}, + "Compute Cauchy Point": {}, + "Iteration Limit": {}, + "Maximum Spectral Step Size": {}, + "Maximum Storage Size": {}, + "Minimum Spectral Step Size": {}, + "Relative Tolerance": {}, + "Sufficient Decrease Tolerance": {}, + "Use Nonmonotone Search": {}, + "Use Smallest Model Iterate": {}}, + "Sufficient Decrease Parameter": {}}, + "Safeguard Size": {}, + "Step Acceptance Threshold": {}, + "Subproblem Model": {}, + "Subproblem Solver": {}, + "TRN": {"Cauchy Point": {"Decrease Tolerance": {}, + "Expansion Rate": {}, + "Initial Step Size": {}, + "Maximum Number of Expansion Steps": {}, + "Maximum Number of Reduction Steps": {}, + "Normalize Initial Step Size": {}, + "Reduction Rate": {}}, + "Relative Tolerance Exponent": {}, + "Solver": {"Absolute Tolerance": {}, + "Iteration Limit": {}, + "Maximum Spectral Step Size": {}, + "Maximum Storage Size": {}, + "Minimum Spectral Step Size": {}, + "NCG": {"Descent Parameter": {}, + "Nonlinear CG Type": {}, + "Truncation Parameter for HZ CG": {}}, + "Relative Tolerance": {}, + "Subproblem Solver": {}, + "Sufficient Decrease Tolerance": {}, + "Use Nonmonotone Search": {}, + "Use Smallest Model Iterate": {}}, + "Sufficient Decrease Parameter": {}}, + "Use Radius Interpolation": {}}, + "Type": {}, + "iPiano": {"Apply Prox to Initial Guess": {}, + "Backtracking Rate": {}, + "Increase Rate": {}, + "Initial Lipschitz Constant Estimate": {}, + "Lower Interpolation Factor": {}, + "Momentum Parameter": {}, + "Reduction Iteration Limit": {}, + "Upper Interpolation Factor": {}, + "Use Constant Beta": {}}}, + "Sync Hessian Rank": {}, + 
"Truncate Approximation": {}, + "Upper Bound": {}, + "Use Basic Rank Update": {}, + "Use Hessian": {}, + "Use Only Sketched Sensitivity": {}, + "Use Sketching": {}, + "Weight Type": {}, + "Weights File Name": {}} diff --git a/packages/rol/rol_parameters/find_instances.py b/packages/rol/rol_parameters/find_instances.py new file mode 100644 index 000000000000..ef5d2f0b368e --- /dev/null +++ b/packages/rol/rol_parameters/find_instances.py @@ -0,0 +1,203 @@ +import re +import subprocess +import pathlib +from pprint import pprint +from typing import Set, Optional + + + +def run_grep_command(src_directory): + grep_command = [ + 'grep', + '-rE', + '-e', + r'(\.|\->)\s*(((s|g)et\s*\(\s*"([a-zA-Z0-9]|\s)+"\s*,\s*\S+\s*\))|sublist)', + '-e', + r'(\.|\->)\s*sublist\s*\(\s*\"', + src_directory + ] + + try: + result = subprocess.run(grep_command, capture_output=True, text=True, check=True) + return result.stdout + except subprocess.CalledProcessError as e: + print(f"Error occurred: {e}") + return e.stderr + + +def split_cpp_code(code_string): + # Use a regular expression to split on both '.' and '->' + # The regex looks for either '->' or '.' as delimiters + split_pattern = r'->|\.' 
+ + # Split the string and discard the delimiters + tokens = re.split(split_pattern, code_string) + + # Remove any empty strings from the result and strip whitespace + tokens = [token.strip() for token in tokens if token.strip()] + + return tokens + + +def extract_quoted_substring(input_string): + # Regular expression pattern to match content between double quotes + pattern = r'"([^"]*)"' + + # Search for the pattern in the input string + match = re.search(pattern, input_string) + + if match: + # If a match is found, return the content between the quotes + return match.group(1) + else: + # If no match is found, return None or an empty string + return None # or return "" if you prefer + + + + + + + + + + + +def extract_quoted_strings(string_list): + return tuple((s.strip('"') for s in string_list if s.startswith('"') and s.endswith('"'))) + +def custom_sort_key(sublist): + return sublist[:len(sublist)] + +def sort_list_of_lists(list_of_lists): + return sorted(list_of_lists, key=custom_sort_key) + + + + +def parse_cpp_strings(input_list): + parsed_list = [] + + for item in input_list: + # Match a word without parentheses, a quoted string inside parentheses, + # or a quoted string as the first argument of get() or set() + match = re.search(r'(\w+)$|"([^"]*)"|\b(?:get|set)\s*\(\s*"([^"]*)"', item) + if match: + if match.group(1): # If it's a word without parentheses + parsed_list.append(match.group(1)) + elif match.group(2): # If it's a quoted string inside parentheses + parsed_list.append(f'"{match.group(2)}"') + elif match.group(3): # If it's a quoted string in get() or set() + parsed_list.append(f'"{match.group(3)}"') + + return parsed_list + +def build_hierarchy(data): + def resolve_list(value_list): + if not value_list: + return value_list + + first_item = value_list[0] + if first_item in data and not first_item.startswith('"'): + return resolve_list(data[first_item]) + value_list[1:] + else: + return [first_item] + resolve_list(value_list[1:]) + + return {key: 
resolve_list(value) for key, value in data.items()} + +def build_list_hierarchy(data_dict, input_lists): + def resolve_list(value_list): + if not value_list: + return value_list + + first_item = value_list[0] + if first_item in data_dict and not first_item.startswith('"'): + return resolve_list(data_dict[first_item]) + value_list[1:] + else: + return [first_item] + resolve_list(value_list[1:]) + + return [resolve_list(sublist) for sublist in input_lists] + +def create_hierarchical_dict(list_of_lists): + result = {} + for path in list_of_lists: + current = result + for key in path[:-1]: + if key not in current: + current[key] = {} + current = current[key] + current[path[-1]] = {} + return result + +if __name__ == '__main__': + + # Every line contains an instance calling at least one of the three functions: + # + # - ParameterList::sublist + # - ParameterList::get + # - ParameterList::set + # + # 1) Defining a local sublist variable + # 2) Getting a parameter + # 3) Setting a parameter + + rol_src = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src') + + make_relative = lambda path_str : pathlib.Path(path_str).relative_to(rol_src,walk_up=True) + + strip_excess_whitespace = lambda text : re.sub(r'\s+',' ',text).strip() + + sublist_pattern = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)') + + data = dict() + + exclusions = ['compatibility','step','zoo'] + + local_sublist_pattern = re.compile(r'[ParameterList|auto]\s*[&]\s*(\w+)\s*=\s*(\w+)[\.|\->](.*)') + + output = run_grep_command(rol_src) + for line in output.splitlines(): + splitline = line.split(':') + file = str(make_relative(splitline[0])) + code = strip_excess_whitespace(':'.join(splitline[1:])) + if not any(f'{e}/' in file for e in exclusions): + if file not in data.keys(): + data[file] = [code] + else: + data[file].append(code) + +# with open('list_of_rol_files.txt','w') as f: +# f.write('\n'.join(sorted(data.keys()))) + + paramset = set() + + for file, code in data.items(): +# 
print(f'{file}') + sublist = dict() + parameters = list() + for line in code: +# print(line) + # Look for locally defined sublists + match = re.search(local_sublist_pattern,line) + if match: + sublist[match.group(1)] = [match.group(2)] + parse_cpp_strings( split_cpp_code(match.group(3))) + else: + if '=' in line: + line = line.split('=')[1].strip() + parameters.append(parse_cpp_strings(split_cpp_code(line))) + sublist = build_hierarchy(sublist) +# print(sublist) + parameters = build_list_hierarchy(sublist,parameters) + [ paramset.add(tuple(p)) for p in map(extract_quoted_strings,parameters)] + + parameters = sorted(filter(len,map(list,paramset))) + +# for p in parameters: +# print(p) + + parameters = create_hierarchical_dict(parameters) + + +# pprint(parameters) +# for p in paramset: +# print(p) diff --git a/packages/rol/rol_parameters/list_of_rol_files.txt b/packages/rol/rol_parameters/list_of_rol_files.txt new file mode 100644 index 000000000000..49a3a4f21b40 --- /dev/null +++ b/packages/rol/rol_parameters/list_of_rol_files.txt @@ -0,0 +1,148 @@ +algorithm/ROL_OptimizationProblem.hpp +algorithm/ROL_OptimizationSolver.hpp +algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp +algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_GradientAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_InteriorPointAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_MoreauYosidaAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_NewtonKrylovAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_PrimalDualActiveSetAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_QuasiNewtonAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_SpectralGradientAlgorithm_Def.hpp +algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp +algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp +algorithm/TypeE/ROL_TypeE_AugmentedLagrangianAlgorithm_Def.hpp 
+algorithm/TypeE/ROL_TypeE_CompositeStepAlgorithm_Def.hpp +algorithm/TypeE/ROL_TypeE_FletcherAlgorithm_Def.hpp +algorithm/TypeE/ROL_TypeE_StabilizedLCLAlgorithm_Def.hpp +algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp +algorithm/TypeG/ROL_TypeG_AugmentedLagrangianAlgorithm_Def.hpp +algorithm/TypeG/ROL_TypeG_InteriorPointAlgorithm_Def.hpp +algorithm/TypeG/ROL_TypeG_MoreauYosidaAlgorithm_Def.hpp +algorithm/TypeG/ROL_TypeG_StabilizedLCLAlgorithm_Def.hpp +algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp +algorithm/TypeG/fletcher/ROL_FletcherObjectiveBase_Def.hpp +algorithm/TypeG/interiorpoint/ROL_InteriorPointObjective.hpp +algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp +algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp +algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp +algorithm/TypeP/ROL_TypeP_ProxGradientAlgorithm_Def.hpp +algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm_Def.hpp +algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp +algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp +algorithm/TypeP/ROL_TypeP_iPianoAlgorithm_Def.hpp +algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp +algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp +algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp +algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp +algorithm/TypeU/linesearch/ROL_BackTracking_U.hpp +algorithm/TypeU/linesearch/ROL_CubicInterp_U.hpp +algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp +algorithm/TypeU/linesearch/ROL_LineSearch_U_Factory.hpp +algorithm/TypeU/linesearch/ROL_PathBasedTargetLevel_U.hpp +algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp +algorithm/TypeU/linesearch/descent/ROL_DescentDirection_U_Factory.hpp +algorithm/TypeU/linesearch/descent/ROL_NewtonKrylov_U.hpp +algorithm/TypeU/linesearch/descent/ROL_NonlinearCG_U.hpp +algorithm/TypeU/linesearch/descent/ROL_QuasiNewton_U.hpp +algorithm/TypeU/trustregion/ROL_SPGTrustRegion_U.hpp +algorithm/TypeU/trustregion/ROL_TruncatedCG_U.hpp 
+algorithm/TypeU/trustregion/ROL_TrustRegionModel_U.hpp +algorithm/TypeU/trustregion/ROL_TrustRegion_U_Factory.hpp +function/dynamic/ROL_DynamicConstraint.hpp +function/dynamic/ROL_ReducedDynamicObjective.hpp +function/polyproj/ROL_BrentsProjection_Def.hpp +function/polyproj/ROL_DaiFletcherProjection_Def.hpp +function/polyproj/ROL_DouglasRachfordProjection_Def.hpp +function/polyproj/ROL_DykstraProjection_Def.hpp +function/polyproj/ROL_PolyhedralProjectionFactory.hpp +function/polyproj/ROL_RiddersProjection_Def.hpp +function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp +function/simopt/ROL_Constraint_SimOpt.hpp +oed/ROL_OED_Factory_Def.hpp +sol/algorithm/ROL_PrimalDualRisk.hpp +sol/algorithm/ROL_ProgressiveHedging.hpp +sol/algorithm/ROL_StochasticProblem_Def.hpp +sol/function/ROL_AbsoluteValue.hpp +sol/function/ROL_PlusFunction.hpp +sol/function/ROL_RiskBoundConstraint.hpp +sol/function/distribution/ROL_Arcsine.hpp +sol/function/distribution/ROL_Beta.hpp +sol/function/distribution/ROL_Cauchy.hpp +sol/function/distribution/ROL_Dirac.hpp +sol/function/distribution/ROL_DistributionFactory.hpp +sol/function/distribution/ROL_Exponential.hpp +sol/function/distribution/ROL_Gamma.hpp +sol/function/distribution/ROL_Gaussian.hpp +sol/function/distribution/ROL_Gumbel.hpp +sol/function/distribution/ROL_Kumaraswamy.hpp +sol/function/distribution/ROL_Laplace.hpp +sol/function/distribution/ROL_Logistic.hpp +sol/function/distribution/ROL_Parabolic.hpp +sol/function/distribution/ROL_RaisedCosine.hpp +sol/function/distribution/ROL_Smale.hpp +sol/function/distribution/ROL_Triangle.hpp +sol/function/distribution/ROL_TruncatedExponential.hpp +sol/function/distribution/ROL_TruncatedGaussian.hpp +sol/function/distribution/ROL_Uniform.hpp +sol/function/expectationquad/ROL_GenMoreauYosidaCVaR.hpp +sol/function/expectationquad/ROL_LogExponentialQuadrangle.hpp +sol/function/expectationquad/ROL_LogQuantileQuadrangle.hpp +sol/function/expectationquad/ROL_MeanVarianceQuadrangle.hpp 
+sol/function/expectationquad/ROL_MoreauYosidaCVaR.hpp +sol/function/expectationquad/ROL_QuantileQuadrangle.hpp +sol/function/expectationquad/ROL_SmoothedWorstCaseQuadrangle.hpp +sol/function/expectationquad/ROL_TruncatedMeanQuadrangle.hpp +sol/function/progressivehedging/ROL_PH_DeviationObjective.hpp +sol/function/progressivehedging/ROL_PH_ErrorObjective.hpp +sol/function/progressivehedging/ROL_PH_Objective.hpp +sol/function/progressivehedging/ROL_PH_ProbObjective.hpp +sol/function/progressivehedging/ROL_PH_RegretObjective.hpp +sol/function/progressivehedging/ROL_PH_RiskObjective.hpp +sol/function/progressivehedging/ROL_PH_bPOEObjective.hpp +sol/function/randvarfunctional/ROL_RandVarFunctionalFactory.hpp +sol/function/randvarfunctional/ROL_RandVarFunctionalInfo.hpp +sol/function/randvarfunctional/ROL_StochasticObjective.hpp +sol/function/randvarfunctional/deviation/ROL_DeviationMeasureFactory.hpp +sol/function/randvarfunctional/deviation/ROL_DeviationMeasureInfo.hpp +sol/function/randvarfunctional/error/ROL_ErrorMeasureFactory.hpp +sol/function/randvarfunctional/error/ROL_ErrorMeasureInfo.hpp +sol/function/randvarfunctional/probability/ROL_BPOE.hpp +sol/function/randvarfunctional/probability/ROL_ProbabilityFactory.hpp +sol/function/randvarfunctional/probability/ROL_ProbabilityInfo.hpp +sol/function/randvarfunctional/probability/ROL_SmoothedPOE.hpp +sol/function/randvarfunctional/regret/ROL_RegretMeasureFactory.hpp +sol/function/randvarfunctional/regret/ROL_RegretMeasureInfo.hpp +sol/function/randvarfunctional/risk/ROL_CVaR.hpp +sol/function/randvarfunctional/risk/ROL_ConvexCombinationRiskMeasure.hpp +sol/function/randvarfunctional/risk/ROL_EntropicRisk.hpp +sol/function/randvarfunctional/risk/ROL_HMCR.hpp +sol/function/randvarfunctional/risk/ROL_KLDivergence.hpp +sol/function/randvarfunctional/risk/ROL_MeanDeviation.hpp +sol/function/randvarfunctional/risk/ROL_MeanDeviationFromTarget.hpp +sol/function/randvarfunctional/risk/ROL_MeanSemiDeviation.hpp 
+sol/function/randvarfunctional/risk/ROL_MeanSemiDeviationFromTarget.hpp +sol/function/randvarfunctional/risk/ROL_MeanVariance.hpp +sol/function/randvarfunctional/risk/ROL_MeanVarianceFromTarget.hpp +sol/function/randvarfunctional/risk/ROL_MixedCVaR.hpp +sol/function/randvarfunctional/risk/ROL_QuantileRadius.hpp +sol/function/randvarfunctional/risk/ROL_RiskMeasureFactory.hpp +sol/function/randvarfunctional/risk/ROL_RiskMeasureInfo.hpp +sol/function/randvarfunctional/risk/fdivergence/ROL_FDivergence.hpp +sol/function/randvarfunctional/risk/spectral/ROL_ChebyshevSpectral.hpp +sol/function/randvarfunctional/risk/spectral/ROL_SecondOrderCVaR.hpp +sol/function/randvarfunctional/risk/spectral/ROL_SpectralRisk.hpp +sol/sampler/ROL_SROMGenerator.hpp +sol/sampler/ROL_UserInputGenerator.hpp +sol/status/ROL_PH_StatusTest.hpp +status/ROL_BundleStatusTest.hpp +status/ROL_ConstraintStatusTest.hpp +status/ROL_FletcherStatusTest.hpp +status/ROL_StatusTest.hpp +utils/ROL_BisectionScalarMinimization.hpp +utils/ROL_BrentsScalarMinimization.hpp +utils/ROL_GoldenSectionScalarMinimization.hpp +utils/ROL_ScalarMinimizationTest.hpp \ No newline at end of file From 4dff65a96640d6473d88c9dc0282bb0441f77c77 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Thu, 15 Aug 2024 12:40:22 -0600 Subject: [PATCH 047/243] MueLu: Code Review Fixes Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index ad5895e2e41b..0431bf011541 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -117,6 +117,11 @@ struct DropTol { }; } // namespace Details +enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + 
scaled_cut_symmetric }; + template RCP CoalesceDropFactory::GetValidParameterList() const { RCP validParamList = rcp(new ParameterList()); @@ -354,11 +359,6 @@ void CoalesceDropFactory::Build(Level #endif //////////////////////////////////////////////////// - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; decisionAlgoType classicalAlgo = defaultAlgo; if (algo == "distance laplacian") { @@ -591,24 +591,24 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); auto At = Utilities::Op2TpetraCrs(A); auto A_device = At->getLocalMatrixDevice(); - int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); - auto rowView = A_device.row(row); + auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t n = 0; + size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); @@ -629,10 +629,10 @@ void 
CoalesceDropFactory::Build(Level drop_view(colID) = false; count++; } - }, n); + }, dropSize); - size_t dropStart = n; - if (algorithm == unscaled_cut) { + size_t dropStart = dropSize; + if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -646,7 +646,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -664,7 +664,7 @@ void CoalesceDropFactory::Build(Level } } }, Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { + } else if (classicalAlgo == scaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -680,7 +680,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; @@ -705,22 +705,23 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + if(dropStart < dropSize) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, 
dropSize), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; keep++; } else { - rowView.colidx(idxID) = -1; + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; drop++; } }, rownnz, rowDropped); @@ -731,7 +732,6 @@ void CoalesceDropFactory::Build(Level }, realnnz, numDropped); //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); From babef0cdd20b652add2d093ccd653e105931cd07 Mon Sep 17 00:00:00 2001 From: Aurya Javeed Date: Mon, 19 Aug 2024 10:30:38 -0600 Subject: [PATCH 048/243] Fix shadowing warning --- .../TypeU/bundle/ROL_Bundle_U_TT_Def.hpp | 132 +++++++++--------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/packages/rol/src/algorithm/TypeU/bundle/ROL_Bundle_U_TT_Def.hpp b/packages/rol/src/algorithm/TypeU/bundle/ROL_Bundle_U_TT_Def.hpp index 5c00814a59d3..2e530f5bb161 100644 --- a/packages/rol/src/algorithm/TypeU/bundle/ROL_Bundle_U_TT_Def.hpp +++ b/packages/rol/src/algorithm/TypeU/bundle/ROL_Bundle_U_TT_Def.hpp @@ -45,10 +45,10 @@ #define ROL_BUNDLE_U_TT_DEF_H #include "ROL_Types.hpp" -#include -#include -#include -#include +#include +#include +#include +#include #include // TT: std::find #define EXACT 1 @@ -61,7 +61,7 @@ template Bundle_U_TT::Bundle_U_TT(const unsigned maxSize, const Real coeff, const Real omega, - const unsigned remSize) + const unsigned remSize) : 
Bundle_U(maxSize,coeff,omega,remSize), maxSize_(maxSize), isInitialized_(false) { maxind_ = std::numeric_limits::max(); @@ -77,7 +77,7 @@ Real Bundle_U_TT::sgn(const Real x) const { return ((x < zero) ? -one : ((x > zero) ? one : zero)); } - + template unsigned Bundle_U_TT::solveDual(const Real t, const unsigned maxit, const Real tol) { unsigned iter = 0; @@ -153,16 +153,16 @@ void Bundle_U_TT::addSubgradToBase(unsigned ind, Real delta) { base_[currSize_-1] = tmp; ind--; } // end if dependent - + L_(ind,ind) = delta; - + // update z1 and z2 unsigned zsize = ind+1; z1_.resize(zsize); z2_.resize(zsize); z1_[ind] = ( static_cast(1) - lhz1_ ) / delta; - z2_[ind] = ( Bundle_U::alpha(base_[ind]) - lhz2_ ) / delta; - //z2[zsize-1] = ( Bundle_U::alpha(entering_) - lhz2_ ) / delta; - + z2_[ind] = ( Bundle_U::alpha(base_[ind]) - lhz2_ ) / delta; + //z2[zsize-1] = ( Bundle_U::alpha(entering_) - lhz2_ ) / delta; + // update kappa if(delta > L_(LiMax_,LiMax_)){ LiMax_ = ind; @@ -179,11 +179,11 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ const Real zero(0), one(1); // update L, currSize, base_, z1, z2, dependent, dualVariables_, kappa if (ind >= currSize_-dependent_){ - // if dependent > 0, the last one or two rows of L are lin. dependent + // if dependent > 0, the last one or two rows of L are lin. dependent if (ind < currSize_-1){ // eliminate currSize_-2 but keep currSize_-1 // swap the last row with the second to last swapRowsL(ind,currSize_-1); - base_[ind] = base_[currSize_-1]; + base_[ind] = base_[currSize_-1]; #if( ! 
EXACT ) lhNorm = ljNorm; // new last row is lh #endif @@ -199,22 +199,22 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ } // end if dependent item /* currently L_B is lower trapezoidal - + | L_1 0 0 | L_B = | l d 0 | - | Z v L_2 | - + | Z v L_2 | + Apply Givens rotations to transform it to - + | L_1 0 0 | | l d 0 | | Z 0 L_2' | - + then delete row and column to obtain factorization of L_B' with B' = B/{i} - + L_B' = | L_1 0 | | Z L_2' | - + */ for (unsigned j=ind+1; j::deleteSubgradFromBase(unsigned ind, Real tol){ // d = hypot(ai,aj); // Gc = aj/d; // Gs = -ai/d; - - L_(j,j) = d; L_(j,ind) = zero; + + L_(j,j) = d; L_(j,ind) = zero; // apply Givens to columns i,j of L for (unsigned h=j+1; h::deleteSubgradFromBase(unsigned ind, Real tol){ if( dependent_ > 1 ) // j = currSize_ - 1, h = currSize_ - 2 deltaLj_ = L_(currSize_-1,ind); } - + // shift rows and columns of L by exchanging i-th row with next row and i-th column with next column until the row to be deleted is the last, then deleting last row and column swapRowsL(ind,currSize_-1); swapRowsL(ind,currSize_-1,true); @@ -298,7 +298,7 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ // update kappa updateK(); - + if(dependent_){ // if some previously dependent item have become independent // recompute deltaLh @@ -315,8 +315,8 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ Real signum = sgn1 * sgn2; // sgn( deltaLh ) * sgn ( deltaLj ); deltaLh_ = std::abs( ghNorm - lhNorm + deltaLh_ * deltaLh_); #endif - - if( std::sqrt(deltaLh_) > tol*kappa_*std::max(static_cast(1),ghNorm) ){ // originally had deltaLh without sqrt + + if( std::sqrt(deltaLh_) > tol*kappa_*std::max(static_cast(1),ghNorm) ){ // originally had deltaLh without sqrt unsigned newind = currSize_-dependent_; dependent_--; // get the last row of L @@ -329,7 +329,7 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ lhz2_ += lh_[ii]*z2_[ii]; } deltaLh_ = std::sqrt(deltaLh_); - 
addSubgradToBase(newind,deltaLh_); + addSubgradToBase(newind,deltaLh_); if(dependent_){ // dependent was 2 #if( ! EXACT ) @@ -366,13 +366,13 @@ void Bundle_U_TT::deleteSubgradFromBase(unsigned ind, Real tol){ #else deltaLj_ = std::abs( gjNorm - ljNorm + deltaLj_ * deltaLj_); #endif - + if( std::sqrt(deltaLj_) > tol*kappa_*std::max(static_cast(1),gjNorm) ){ // originally had deltaLj without sqrt unsigned newind = currSize_-1; dependent_--; // get the last row of L lj_.size(newind-1); // initialize to zeros; - Real ljz1_ = zero; + ljz1_ = zero; Real ljTz2 = zero; for (unsigned ii=0;ii::deleteSubgradFromBase(unsigned ind, Real tol){ ljTz2 += lj_[ii]*z2_[ii]; } deltaLj_ = std::sqrt(deltaLj_); - addSubgradToBase(newind,deltaLj_); + addSubgradToBase(newind,deltaLj_); #if( EXACT ) deltaLh_ = GiGj(base_[currSize_-2],base_[currSize_-1]); for (unsigned ii=0;ii::deleteSubgradFromBase(unsigned ind, Real tol){ } // end if ( dependent > 1 ) } // end if(dependent) }// end deleteSubgradFromBase() - + template void Bundle_U_TT::solveSystem(int size, char tran, LA::Matrix &L, LA::Vector &v){ int info; @@ -435,8 +435,8 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con entering_ = maxind_; // cold start - optimal_ = true; - dependent_ = 0; + optimal_ = true; + dependent_ = 0; rho_ = ROL_INF(); // value of rho = -v currSize_ = 1; // current base size base_.clear(); @@ -458,7 +458,7 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con kappa_ = one; // condition number of matrix L ( >= max|L_ii|/min|L_ii| ) objval_ = ROL_INF(); // value of objective minobjval_ = ROL_INF(); // min value of objective (ever reached) - + unsigned iter; //-------------------------- MAIN LOOP --------------------------------// for (iter=0;iter::solveDual_TT(const Real t, const unsigned maxit, con L = L_B' */ z1z2 = z1_.dot(z2_); - z1z1 = z1_.dot(z1_); + z1z1 = z1_.dot(z1_); rho_ = ( one + z1z2/t )/z1z1; tempv_ = z1_; tempv_.scale(rho_); tempw1_ = z2_; 
tempw1_.scale(one/t); @@ -480,26 +480,26 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con optimal_ = true; break; } - case(1): + case(1): { /* L = | L_B' 0 | \ currSize | l_h^T 0 | / */ - LA::Matrix LBprime( LA::Copy,L_,currSize_-1,currSize_-1); + LA::Matrix LBprime( LA::Copy,L_,currSize_-1,currSize_-1); lh_.size(currSize_-1); // initialize to zeros; lhz1_ = zero; lhz2_ = zero; for(unsigned i=0; i::solveDual_TT(const Real t, const unsigned maxit, con // system has (unique) solution rho_ = ( (Bundle_U::alpha(base_[currSize_-1]) - lhz2_)/t ) / ( one - lhz1_ ); z1z2 = z1_.dot(z2_); - z1z1 = z1_.dot(z1_); + z1z1 = z1_.dot(z1_); Real tmp = ( one + z1z2 / t - rho_ * z1z1 )/( one - lhz1_ ); tempw1_ = z1_; tempw1_.scale(rho_); tempw2_ = z2_; tempw2_.scale(one/t); @@ -539,13 +539,13 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con Real tmp1 = L_(currSize_-1,i); Real tmp2 = L_(currSize_-2,i); ljz1_ += tmp1*z1_(i); - lhz1_ += tmp2*z1_(i); + lhz1_ += tmp2*z1_(i); lj_[i] = tmp1; lh_[i] = tmp2; } - if(std::abs(ljz1_-one) <= tol*kappa_){ + if(std::abs(ljz1_-one) <= tol*kappa_){ // tempv is an infinite direction - tempv_ = lj_; + tempv_ = lj_; solveSystem(currSize_-2,'T',LBprime,tempv_); tempv_.resize(currSize_); // add two last entries tempv_[currSize_-2] = zero; @@ -573,7 +573,7 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con // set dual variables to values in tempv Bundle_U::resetDualVariables(); for (unsigned i=0; i::setDualVariable(base_[i],tempv_[i]); + Bundle_U::setDualVariable(base_[i],tempv_[i]); } } else{ @@ -613,19 +613,19 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con QPStatus_ = -1; // invalid step return iter; } - + for (unsigned i=0; i::setDualVariable(base_[i],Bundle_U::getDualVariable(base_[i]) + step * tempv_[i]); + Bundle_U::setDualVariable(base_[i],Bundle_U::getDualVariable(base_[i]) + step * tempv_[i]); }// if(!optimal) - + 
//------------------------- ITEMS ELIMINATION ---------------------------// - + // Eliminate items with 0 multipliers from base bool deleted = optimal_; for (unsigned baseitem=0; baseitem::getDualVariable(base_[baseitem]) <= tol){ deleted = true; - + #if( TABOO_LIST ) // item that just entered shouldn't exit; if it does, mark it as taboo if( base_[baseitem] == entering_ ){ @@ -634,18 +634,18 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con } #endif - // eliminate item from base; + // eliminate item from base; deleteSubgradFromBase(baseitem,tol); } // end if(dualVariables_[baseitem] < tol) - } // end loop over baseitem - + } // end loop over baseitem + if(!deleted){ // nothing deleted and not optimal QPStatus_ = -2; // loop return iter; } } // end inner loop - + Real newobjval(0), Lin(0), Quad(0); // new objective value for (unsigned i=0; i::alpha(base_[i])*Bundle_U::getDualVariable(base_[i]); @@ -660,12 +660,12 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con } #if( TABOO_LIST ) - // -- test for strict decrease -- // + // -- test for strict decrease -- // // if item didn't provide decrease, move it to taboo list ... if( ( entering_ < maxind_ ) && ( objval_ < ROL_INF() ) ){ if( newobjval >= objval_ - std::max( tol*std::abs(objval_), ROL_EPSILON() ) ){ taboo_.push_back(entering_); - } + } } #endif @@ -680,7 +680,7 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con //---------------------- OPTIMALITY TEST -------------------------// if ( (rho_ >= ROL_NINF()) && (objval_ <= ROL_NINF()) ) // if current x (dualVariables_) is feasible break; - + entering_ = maxind_; Real minro = - std::max( tol*currSize_*std::abs(objval_), ROL_EPSILON() ); #if ( ! 
FIRST_VIOLATED ) @@ -703,15 +703,15 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con if (rho_ >= ROL_NINF()){ - ro = ro - rho_; // note: rho = -v + ro = ro - rho_; // note: rho = -v } else{ ro = ROL_NINF(); minobjval_ = ROL_INF(); objval_ = ROL_INF(); } - - if (ro < minro){ + + if (ro < minro){ #if ( FIRST_VIOLATED ) entering_ = bundleitem; break; // skip going through rest of constraints; alternatively, could look for "most violated" @@ -723,11 +723,11 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con } #endif } - + } // end if item not in base }// end of loop over items in bundle - //----------------- INSERTING ITEM ------------------------// + //----------------- INSERTING ITEM ------------------------// if (entering_ < maxind_){ // dual constraint is violated optimal_ = false; Bundle_U::setDualVariable(entering_,zero); @@ -740,7 +740,7 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con for (unsigned i=0; i LBprime( LA::Copy,L_,zsize,zsize); + LA::Matrix LBprime( LA::Copy,L_,zsize,zsize); solveSystem(zsize,'N',LBprime,lh_); // lh = (L_B^{-1})*(G_B^T*g_h) for (unsigned i=0; i::solveDual_TT(const Real t, const unsigned maxit, con #endif currSize_++; // update base size - + L_.reshape(currSize_,currSize_); zsize = currSize_ - dependent_; // zsize is the size of L_Bprime (new one) for (unsigned i=0; i deltaeps ){ // new row is independent // add subgradient to the base unsigned ind = currSize_-1; @@ -788,7 +788,7 @@ unsigned Bundle_U_TT::solveDual_TT(const Real t, const unsigned maxit, con } if(optimal_) - break; + break; } // end main loop taboo_.clear(); From e5628586ce416cf4707e1927db4b5c1b3376ba2c Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 27 Aug 2024 17:16:14 -0600 Subject: [PATCH 049/243] Updated the tensor-opt example to work with ROL v2.0. 
Signed-off-by: Drew Kouri --- .../rol/example/tensor-opt/example_01.cpp | 29 ++-- .../rol/example/tensor-opt/example_01.xml | 124 +++++++++--------- 2 files changed, 79 insertions(+), 74 deletions(-) diff --git a/packages/rol/example/tensor-opt/example_01.cpp b/packages/rol/example/tensor-opt/example_01.cpp index 75b7eace379f..58be9ca5d8f1 100644 --- a/packages/rol/example/tensor-opt/example_01.cpp +++ b/packages/rol/example/tensor-opt/example_01.cpp @@ -465,6 +465,14 @@ class SemidefiniteProgramming ehv.scale(DT2_(0.5), ev); } + + void precond (ROL::Vector & hv, + const ROL::Vector & v, + const ROL::Vector & x, + DT2_ & tol) override + { + invHessVec(hv,v,x,tol); + } }; /******************************************************************************/ @@ -697,9 +705,8 @@ class SemidefiniteProgramming _problem->addConstraint("Inequality Constraint 1", _icon[1], _imul[1], _ibnd[1]); _problem->addConstraint("Inequality Constraint 2", _icon[2], _imul[2], _ibnd[2]); _problem->addConstraint("Inequality Constraint 3", _icon[3], _imul[3], _ibnd[3]); + _problem->finalize(false,true,std::cout); _solver = ROL::makePtr>(_problem, * _parlist); - //_problem = ROL::makePtr>(_obj, _x, _bnd, _icon, _imul, _ibnd); - //_solver = ROL::makePtr>(* _problem, * _parlist); _x->zero(); } @@ -757,8 +764,9 @@ class SemidefiniteProgramming { _x->wrap(x); for (auto& it : _imul) it->zero(); - _solver->reset(); - //_problem->reset(); + _problem->edit(); + _problem->finalize(false,true,std::cout); + _solver = ROL::makePtr>(_problem, * _parlist); _solver->solve(outStream); return _x->data(); @@ -874,21 +882,22 @@ int main(int argc, char *argv[]) { // start from zero solution DataType y[] = {0.0, 0.0, 0.0}; sp.solve(y); - std::cout << "y = [" << y[0] << ", " << y[1] << ", " << y[2] << "]" << std::endl; + std::cout << std::setprecision(16) << "y = [" << y[0] << ", " << y[1] << ", " << y[2] << "]" << std::endl; // solve one more time DataType z[] = {0.0, 0.0, 0.0}; sp.solve(z); - std::cout << "z = [" 
<< z[0] << ", " << z[1] << ", " << z[2] << "]" << std::endl; + std::cout << std::setprecision(16) << "z = [" << z[0] << ", " << z[1] << ", " << z[2] << "]" << std::endl; // perform checks + DataType tol = std::sqrt(ROL::ROL_EPSILON()); // * xnorm; ROL::CArrayVector xx(&x[0],dim), yy(&y[0],dim), zz(&z[0],dim); xx.axpy(static_cast(-1), yy); - if (xx.norm() > std::sqrt(ROL::ROL_EPSILON())) { - *outStream << "\n\nxx.norm() = " << xx.norm() << "\n"; + if (xx.norm() > tol) { + *outStream << std::endl << "xx.norm() = " << xx.norm() << std::endl; errorFlag = 1000; } yy.axpy(static_cast(-1), zz); - if (yy.norm() > ROL::ROL_EPSILON()) { - *outStream << "\n\nyy.norm() = " << yy.norm() << "\n"; + if (yy.norm() > tol) { + *outStream << std::endl << "yy.norm() = " << yy.norm() << std::endl; errorFlag = 1000; } } diff --git a/packages/rol/example/tensor-opt/example_01.xml b/packages/rol/example/tensor-opt/example_01.xml index 0f158e6d4458..143414654144 100644 --- a/packages/rol/example/tensor-opt/example_01.xml +++ b/packages/rol/example/tensor-opt/example_01.xml @@ -1,42 +1,79 @@ + + + - - + + - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + - + + + + + + + + + + + + + + + + + + - + + - + @@ -46,56 +83,15 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + From 62b432ac0f7c75c868d238f0f92f439831a9778c Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 27 Aug 2024 17:19:10 -0600 Subject: [PATCH 050/243] Added capability to read initial control in from file. 
Signed-off-by: Drew Kouri --- .../dynamic/navier-stokes/example_02.cpp | 58 ++++++++++--------- .../dynamic/navier-stokes/input_02.xml | 6 +- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp index 472ccb4219b2..761f4192d00e 100644 --- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp +++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp @@ -311,39 +311,41 @@ int main(int argc, char *argv[]) { /*************************************************************************/ z->zero(); if (useParametricControl) { - // Linearly interpolate between optimal values for angular velocity - // amplitude and Strouhal number obtained for Re=200, 1000 in - // JW He, R Glowinski, R Metcalfe, A Nordlander, J Periaux - // Active Control and Drag Optimization for Flow Past a - // Circular Cylinder - // Journal of Computation Physics, 163, pg. 83-117, 2000. 
- RealT Re = parlist->sublist("Problem").get("Reynolds Number",200.0); - RealT amp0 = 6.0 - (Re - 200.0)/1600.0; - RealT Se0 = 0.74 - (Re - 200.0) * (0.115/800.0); - RealT amp = parlist->sublist("Problem").sublist("Initial Guess").get("Amplitude", amp0); - RealT Se = parlist->sublist("Problem").sublist("Initial Guess").get("Strouhal Number", Se0); - RealT ph = parlist->sublist("Problem").sublist("Initial Guess").get("Phase Shift", 0.0); - for( int k=0; k> zn - = ROL::dynamicPtrCast>(z->get(k))->getParameter()->getVector(); - (*zn)[0] = -amp * std::sin(2.0 * M_PI * Se * timeStamp[k].t[0] + ph); + bool readFromFile = parlist->sublist("Initial Guess").get("Read From File",false); + if (!readFromFile) { + // Linearly interpolate between optimal values for angular velocity + // amplitude and Strouhal number obtained for Re=200, 1000 in + // JW He, R Glowinski, R Metcalfe, A Nordlander, J Periaux + // Active Control and Drag Optimization for Flow Past a + // Circular Cylinder + // Journal of Computation Physics, 163, pg. 83-117, 2000. + RealT Re = parlist->sublist("Problem").get("Reynolds Number",200.0); + RealT amp0 = 6.0 - (Re - 200.0)/1600.0; + RealT Se0 = 0.74 - (Re - 200.0) * (0.115/800.0); + RealT amp = parlist->sublist("Problem").sublist("Initial Guess").get("Amplitude", amp0); + RealT Se = parlist->sublist("Problem").sublist("Initial Guess").get("Strouhal Number", Se0); + RealT ph = parlist->sublist("Problem").sublist("Initial Guess").get("Phase Shift", 0.0); + for( int k=0; k> zn + = ROL::dynamicPtrCast>(z->get(k))->getParameter()->getVector(); + (*zn)[0] = -amp * std::sin(2.0 * M_PI * Se * timeStamp[k].t[0] + ph); + } + } + else { + for (int k = 1; k < nt; ++k) { + std::stringstream zname; + zname << "initial_control." 
<< k-1 << ".txt"; + std::fstream zfile; + zfile.open(zname.str(),std::ios::in); + if (!zfile.is_open()) std::cout << "CANNOT OPEN " << zname.str() << std::endl; + zfile >> (*(ROL::dynamicPtrCast>(z->get(k-1))->getParameter()->getVector()))[0]; + zfile.close(); + } } } //parlist->sublist("Step").sublist("Trust Region").sublist("TRN").sublist("Solver").set("Subproblem Solver", "NCG"); algo = ROL::makePtr>(*parlist); //algo = ROL::makePtr>(*parlist); - ROL::Ptr> ztemp = z->clone(); - ztemp->randomize(); - ROL::Ptr> pztemp = z->clone(); - -// RealT ptol = 1.0; -// nobj->prox(*pztemp, *ztemp, 1.0, ptol); -// -// pztemp->axpy(-1.0, *ztemp); -// -// *outStream << "Error = " -// << pztemp->norm() << std::endl; -// //ROL::OptimizationProblem problem(obj,z);// need to change this //ROL::OptimizationSolver solver(problem,*parlist);// need to change this std::clock_t timer = std::clock(); diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/input_02.xml b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/input_02.xml index 2b4effe5daaa..2fca7dcd42f3 100644 --- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/input_02.xml +++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/input_02.xml @@ -11,9 +11,9 @@ - - - + + + From bdf13fa5d6075509e07f8cda8221f29bbff9e02f Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 27 Aug 2024 17:21:02 -0600 Subject: [PATCH 051/243] Updated doxygen for type P objective. 
Signed-off-by: Drew Kouri --- .../src/function/objective/ROL_Objective.hpp | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/packages/rol/src/function/objective/ROL_Objective.hpp b/packages/rol/src/function/objective/ROL_Objective.hpp index a4ca1773d66c..09650b450366 100644 --- a/packages/rol/src/function/objective/ROL_Objective.hpp +++ b/packages/rol/src/function/objective/ROL_Objective.hpp @@ -186,14 +186,34 @@ class Objective { Pv.set(v.dual()); } - virtual void prox( Vector &Pv, const Vector &v, Real t, Real &tol){ + /** \brief Compute the proximity operator. + + This function returns the proximity operator. + @param[out] Pv is the proximity operator applied to \f$v\f$ (primal optimization vector). + @param[in] v is the input to the proximity operator (primal optimization vector). + @param[in] t is the proximity operator parameter (positive scalar). + @param[in] tol is a tolerance for inexact objective function computation. + */ + virtual void prox( Vector &Pv, const Vector &v, Real t, Real &tol){ ROL_UNUSED(Pv); ROL_UNUSED(v); ROL_UNUSED(t); ROL_UNUSED(tol); ROL_TEST_FOR_EXCEPTION(true, std::invalid_argument, ">>> ERROR (ROL::Objective): prox not implemented!"); - } + } + + + /** \brief Apply the Jacobian of the proximity operator. + + This function applies the Jacobian of the proximity operator. + @param[out] Jv is the Jacobian of the proximity operator at \f$x\f$ applied to \f$v\f$ (primal optimization vector). + @param[in] v is the direction vector (primal optimization vector). + @param[in] x is the input to the proximity operator (primal optimization vector). + @param[in] t is the proximity operator parameter (positive scalar). + @param[in] tol is a tolerance for inexact objective function computation. + */ + virtual void proxJacVec( Vector &Jv, const Vector &v, const Vector &x, Real t, Real &tol); /** \brief Finite-difference gradient check. 
@@ -485,6 +505,31 @@ class Objective { const bool printToStream = true, std::ostream & outStream = std::cout ); + /** \brief Finite-difference proximity operator Jacobian-applied-to-vector check. + + This function computes a sequence of one-sided finite-difference checks for the proximity + operator Jacobian. + At each step of the sequence, the finite difference step size \f$h\f$ is decreased. The output + compares the error + \f[ + \left\| \frac{\mathrm{prox}_{t f}(x+hv) - \mathrm{prox}_{t f}(x)}{h} - J_{t f}(x+hv)v\right\|_{\mathcal{X}}, + \f] + if the approximation is first order. Here \f$h\f$ is the finite-difference step size and \f$t\f$ is the (fixed) proximity operator parameter. Note that in some cases the proximity operator + is semismooth, which motivates the evaluation of \f$J_{t f}\f$ at \f$x+hv\f$. + @param[in] x is an optimization vector. + @param[in] v is a direction vector. + @param[in] t is the proximity operator parameter. + @param[in] printToStream is a flag that turns on/off output. + @param[out] outStream is the output stream. + @param[in] numSteps is a parameter which dictates the number of finite difference steps. + */ + virtual std::vector> checkProxJacVec( const Vector &x, + const Vector &v, + Real t = Real(1), + bool printToStream = true, + std::ostream &outStream = std::cout, + int numSteps = ROL_NUM_CHECKDERIV_STEPS); + // Definitions for parametrized (stochastic) objective functions private: std::vector param_; From 38d35af18606863f49c62dcdf0ddcad97464f7a4 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 27 Aug 2024 17:24:03 -0600 Subject: [PATCH 052/243] Added default implementation and test for prox jacobian. 
Signed-off-by: Drew Kouri --- .../function/objective/ROL_ObjectiveDef.hpp | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/packages/rol/src/function/objective/ROL_ObjectiveDef.hpp b/packages/rol/src/function/objective/ROL_ObjectiveDef.hpp index 1d5cee291e21..0e7711d433cc 100644 --- a/packages/rol/src/function/objective/ROL_ObjectiveDef.hpp +++ b/packages/rol/src/function/objective/ROL_ObjectiveDef.hpp @@ -115,6 +115,28 @@ void Objective::hessVec( Vector &hv, const Vector &v, const Ve } } +template +void Objective::proxJacVec(Vector &Jv, const Vector &v, const Vector &x, Real t, Real &tol) { + const Real zero(0), vnorm = v.norm(); + // Get Step Length + if ( vnorm == zero ) { + Jv.zero(); + } + else { + if (prim_ == nullPtr) prim_ = x.clone(); + + //Real h = 2.0/(v.norm()*v.norm())*tol; + const Real one(1), h(std::max(one,x.norm()/vnorm)*tol); + + prim_->set(x); prim_->axpy(h,v); // Set prim = x + hv + prox(Jv,*prim_,t,tol); // Compute prox at prim + prim_->zero(); + prox(*prim_,x,t,tol); // Compute prox at x + Jv.axpy(-one,*prim_); // Construct FD approximation + Jv.scale(one/h); + } +} + template std::vector> Objective::checkGradient( const Vector &x, const Vector &g, @@ -397,6 +419,82 @@ std::vector Objective::checkHessSym( const Vector &x, } // checkHessSym +template +std::vector> Objective::checkProxJacVec( const Vector &x, + const Vector &v, + Real t, + bool printToStream, + std::ostream & outStream, + int numSteps) { + + const Real one(1), scale(0.1); + Real tol = std::sqrt(ROL_EPSILON()); + + int numVals = 4; + std::vector tmp(numVals); + std::vector> hvCheck(numSteps, tmp); + + // Save the format state of the original outStream. + nullstream oldFormatState; + oldFormatState.copyfmt(outStream); + + // Compute prox at x. + Ptr> p = x.clone(); + prox(*p, x, t, tol); + + // Temporary vectors. 
+ Ptr> pdif = x.clone(); + Ptr> Jnew = x.clone(); + Ptr> xnew = x.clone(); + + Real eta(10); + for (int i=0; iset(x); + xnew->axpy(eta, v); + prox(*pdif,*xnew,t,tol); + pdif->axpy(-one,*p); + pdif->scale(one/eta); + proxJacVec(*Jnew,v,*xnew,t,tol); + + // Compute norms of jacvec, finite-difference jacvec, and error. + hvCheck[i][0] = eta; + hvCheck[i][1] = Jnew->norm(); + hvCheck[i][2] = pdif->norm(); + pdif->axpy(-one, *Jnew); + hvCheck[i][3] = pdif->norm(); + + if (printToStream) { + if (i==0) { + outStream << std::right + << std::setw(20) << "Step size" + << std::setw(20) << "norm(Jac*vec)" + << std::setw(20) << "norm(FD approx)" + << std::setw(20) << "norm(abs error)" + << std::endl + << std::setw(20) << "---------" + << std::setw(20) << "--------------" + << std::setw(20) << "---------------" + << std::setw(20) << "---------------" + << std::endl; + } + outStream << std::scientific << std::setprecision(11) << std::right + << std::setw(20) << hvCheck[i][0] + << std::setw(20) << hvCheck[i][1] + << std::setw(20) << hvCheck[i][2] + << std::setw(20) << hvCheck[i][3] + << std::endl; + } + } + + // Reset format state of outStream. + outStream.copyfmt(oldFormatState); + + return hvCheck; +} // checkProxJacVec + } // namespace ROL #endif From e6c03bfa97d8691b37a86996696f131b1b62dceb Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Tue, 27 Aug 2024 17:26:10 -0600 Subject: [PATCH 053/243] Added separate update multiplier and update penalty functions. 
Signed-off-by: Drew Kouri --- .../ROL_MoreauYosidaObjective.hpp | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/packages/rol/src/algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp b/packages/rol/src/algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp index c22642f3b831..c053391a719e 100644 --- a/packages/rol/src/algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp +++ b/packages/rol/src/algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp @@ -209,21 +209,35 @@ class MoreauYosidaObjective : public Objective { lam_->set(lam); } + void updateMultiplier(const Vector &x) { + computePenalty(x); + lam_->set(*u1_); + lam_->axpy(static_cast(-1),*l1_); + lam_->scale(mu_); + isPenEvaluated_ = false; + } + void updatePenalty(Real mu) { + mu_ = mu; + isPenEvaluated_ = false; + } + void updateMultipliers(Real mu, const Vector &x) { if ( bnd_->isActivated() ) { if ( updateMultiplier_ ) { - const Real one(1); - computePenalty(x); - lam_->set(*u1_); - lam_->axpy(-one,*l1_); - lam_->scale(mu_); + updateMultiplier(x); + //const Real one(1); + //computePenalty(x); + //lam_->set(*u1_); + //lam_->axpy(-one,*l1_); + //lam_->scale(mu_); } if ( updatePenalty_ ) { - mu_ = mu; + updatePenalty(mu); + //mu_ = mu; } } nfval_ = 0; ngrad_ = 0; - isPenEvaluated_ = false; + //isPenEvaluated_ = false; } void reset(const Real mu) { From da2accc5aa755a6e067f7d013b0d662d3b688656 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 28 Aug 2024 17:43:29 -0600 Subject: [PATCH 054/243] Added GenericUnaryFunction which allows evaluation of arbitrary unary functions on all elements of a derived vector for a cost of two virtual function calls independent of the dimension of the Vector Signed-off-by: Greg von Winckel --- .../elementwise/ROL_GenericUnaryFunction.hpp | 242 ++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 packages/rol/src/elementwise/ROL_GenericUnaryFunction.hpp diff --git 
a/packages/rol/src/elementwise/ROL_GenericUnaryFunction.hpp b/packages/rol/src/elementwise/ROL_GenericUnaryFunction.hpp new file mode 100644 index 000000000000..47b8ca8881a4 --- /dev/null +++ b/packages/rol/src/elementwise/ROL_GenericUnaryFunction.hpp @@ -0,0 +1,242 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#pragma once +#include + +namespace ROL { + +/** + * @brief A template class representing a unary function that can be applied to Vectors. + * + * This class provides a mechanism for applying unary functions to Vectors efficiently, + * using the Visitor pattern to avoid virtual function calls for each element. + * + * @tparam Real The type used for real numbers in the function and calculations. + */ +template +class GenericUnaryFunction { +private: + template class DerivedVisitor; ///< Forward declaration + +protected: + /** + * @brief Abstract base class for visitors. + * + * This class is part of the Visitor pattern implementation, allowing for + * double dispatch without virtual function calls for each vector element. + */ + struct Visitor { + virtual void visit( const GenericUnaryFunction& ) = 0; + }; + +public: + /** @brief Virtual destructor for proper cleanup of derived classes. */ + virtual ~GenericUnaryFunction() noexcept = default; + + /** + * @brief Apply the unary function to a single value. + * + * @param x The input value. + * @return The result of applying the function to x. 
+ */ + [[nodiscard]] virtual Real operator()( Real x ) const = 0; + + /** + * @brief Accept a visitor, part of the Visitor pattern implementation. + * + * @param visitor The visitor to accept. + */ + virtual void accept( Visitor&& visitor ) const = 0; + + /** + * @brief Apply the unary function to a vector. + * + * This method uses the Visitor pattern to efficiently apply the function + * to all elements of the vector without virtual function calls per element. + * + * @tparam VecT The type of the vector. + * @tparam EvalT The type of the evaluator function. + * @param vec The vector to apply the function to. + * @param eval The evaluator function that defines how to apply the unary function to the vector. + * + * Example Usage: + * @code + * template + * class Vector { + * public: + * virtual void applyGenericUnary( const GenericUnaryFunction& ) {} + * }; + * + * template + * class StdVector : public Vector { + * public: + * void applyGenericUnary( const GenericUnaryFunction& guf ) override { + * guf.apply_vectorized(*this,[](StdVector& vec, const auto& f){ vec.applyGenericUnaryImpl(f); }); + * } + * + * template + * void applyGenericUnaryImpl( const unary_function& f ) { + * for( auto& e : vec_ ) e = f(e); + * } + * private: + * std::vector vec_; + * }; + * @endcode + */ + template + void apply_vectorized( VecT& vec, EvalT&& eval ) const { + accept(VectorVisitor(vec,std::forward(eval))); + } + + /** + * @brief A wrapper class that turns any callable into a GenericUnaryFunction. + * + * This class allows easy creation of GenericUnaryFunction objects from lambdas or other callables. + * + * @tparam Func The type of the callable to wrap. + * @tparam Base The base class to inherit from, defaults to GenericUnaryFunction. + */ + template + class Wrapper: public Base { + public: + + /** + * @brief Construct a new Wrapper object. + * + * @param f The callable to wrap. 
+ */ + Wrapper( Func&& f ) : f_{std::forward(f)} { + static_assert( std::is_invocable_r_v,Real>, + "Callable must take and return Real" ); + } + + /** + * @brief Apply the wrapped function. + * + * @param x The input value. + * @return The result of applying the wrapped function to x. + */ + inline Real operator()( Real x ) const override { return f_(x); } + + private: + /** + * @brief Accept a visitor, part of the Visitor pattern implementation. + * + * @param visitor The visitor to accept. + */ + void accept( Visitor&& visitor ) const override { visitor.visit(*this); } + + Func f_; ///< The wrapped callable. + }; // class Wrapper + + /** + * @brief Class Template Argument Deduction (CTAD) guide for Wrapper. + * + * This allows the compiler to deduce the template arguments for Wrapper + * when constructing it from a callable. + */ + template + Wrapper( Func&& ) -> Wrapper>; + +private: + /** + * @brief A base class for visitors that implements the visit method. + * + * This class uses the Curiously Recurring Template Pattern (CRTP) to + * achieve static polymorphism, avoiding virtual function calls. + * + * @tparam Derived The derived visitor class. + */ + template + struct DerivedVisitor : public Visitor { + /** + * @brief Visit a GenericUnaryFunction object. + * + * This method casts the visitor to the derived type and calls its visitImpl method. + * + * @param uf The GenericUnaryFunction to visit. + */ + void visit( const GenericUnaryFunction& uf ) override { + static_cast(this)->visitImpl(uf); + } + }; // struct DerivedVisitor + + /** + * @brief A visitor that applies a unary function to a vector. + * + * This class implements the actual logic of applying a unary function to a vector. + * + * @tparam VecT The type of the vector. + * @tparam EvalT The type of the evaluator function. + */ + template + class VectorVisitor : public DerivedVisitor> { + public: + /** + * @brief Construct a new VectorVisitor object. 
+ * + * @param vec The vector to apply the function to. + * @param eval The evaluator function. + */ + VectorVisitor( VecT& vec, EvalT&& eval ) + : vec_{vec}, eval_{std::forward(eval)} {} + + /** + * @brief Apply the unary function to the vector. + * + * This method is called by the visit method of the base DerivedVisitor. + * + * @param uf The GenericUnaryFunction to apply. + */ + void visitImpl( const GenericUnaryFunction& uf ) { + eval_(vec_, uf); + } + + private: + VecT& vec_; ///< Reference to the vector. + EvalT eval_; ///< The evaluator function. + }; // class VectorVisitor +}; // class GenericUnaryFunction + +} // namespace ROL From 4795e2b038a95a03b98513204e2eafc792cd1470 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Thu, 29 Aug 2024 12:13:36 -0600 Subject: [PATCH 055/243] Added ROL::GenericFunction, which provides a means to apply elementwise functions with an arbitrary number of arguments to a ROL::Vector Signed-off-by: Greg von Winckel --- .../src/elementwise/ROL_GenericFunction.hpp | 302 ++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 packages/rol/src/elementwise/ROL_GenericFunction.hpp diff --git a/packages/rol/src/elementwise/ROL_GenericFunction.hpp b/packages/rol/src/elementwise/ROL_GenericFunction.hpp new file mode 100644 index 000000000000..aec705d39353 --- /dev/null +++ b/packages/rol/src/elementwise/ROL_GenericFunction.hpp @@ -0,0 +1,302 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#pragma once + +#include +#include +#include +#include +#include + +namespace ROL { + +constexpr std::size_t MAX_ARITY = 16; + +// Function traits to deduce arity +template +struct function_traits : public function_traits {}; + +template +struct function_traits { + static constexpr std::size_t arity = sizeof...(Args); + using result_type = ReturnType; +}; + + + +/** + * @brief A template class representing a function that can be applied to Vectors. + * + * This class provides a mechanism for applying functions to Vectors efficiently, + * using the Visitor pattern to avoid virtual function calls for each element. + * + * @tparam Real The type used for real numbers in the function and calculations. + */ +template +class GenericFunction { +protected: + /** + * @brief Abstract base class for visitors. + * + * This class is part of the Visitor pattern implementation, allowing for + * double dispatch without virtual function calls for each vector element. + */ + struct Visitor { + virtual void visit( const GenericFunction& ) = 0; + }; + + explicit GenericFunction( std::size_t arity ) : arity_{arity} {} + +public: + + GenericFunction( const GenericFunction& ) = delete; + GenericFunction& = operator ( const GenericFunction& ) = delete; + GenericFunction( GenericFunction&& ) = default; + GenericFunction& = operator ( GenericFunction&& ) = default; + + /** @brief Virtual destructor for proper cleanup of derived classes. */ + virtual ~GenericFunction() noexcept = default; + + /** + * @brief Apply the function to a single value. + * + * @param x The input values. + * @return The result of applying the function to x. + */ + [[nodiscard]] virtual Real operator()( const std::vector& x ) const = 0; + + /** + * @brief Accept a visitor, part of the Visitor pattern implementation. 
+ * + * @param visitor The visitor to accept. + */ + virtual void accept( Visitor&& visitor ) const = 0; + + /** + * @brief Apply the function to a vector. + * + * This method uses the Visitor pattern to efficiently apply the function + * to all elements of the vector without virtual function calls per element. + * + * @tparam VecT The type of the vector. + * @tparam EvalT The type of the evaluator function. + * @param vec The vector to apply the function to. + * @param eval The evaluator function that defines how to apply the function to the vector. + * + * Example Usage: + * @code + * template + * class Vector { + * public: + * virtual void applyFunction( const GenericFunction&, const std::vector& ) {} + * virtual int dimension() const { return 0; } + * }; + * + * template + * class StdVector : public Vector { + * public: + * void dimension() const override { return vec_.size(); } + * void applyGenericUnary( const GenericFunction& gf, const std::vector*>& vecs ) override { + * std::vector stdVecs; + * stdVecs.reserve(vecs.size()); + * for(auto vec : vecs) { + * stdVecs.push_back(static_cast(vec)); + * } + * gf.apply_vectorized(stdVecs,[this](const auto& vecs, const auto& f){ this->applyFunctionImpl(f,vecs); }); + * } + * + * template + * void applyFunctionImpl( const F& f, const std::vector& vecs ) { + * std::vector args(vecs.size()); + * for( int i=0; i vec_; + * }; + * @endcode + */ + template + void apply_vectorized( VecT& vec, EvalT&& eval ) const { + accept(VectorVisitor(vec,std::forward(eval))); + } + + /** + * @brief A wrapper class that turns any callable into a GenericFunction. + * + * This class allows easy creation of GenericFunction objects from lambdas or other callables. + * + * @tparam Func The type of the callable to wrap. + * @tparam Base The base class to inherit from, defaults to GenericFunction. 
+ */ + template + class Lambda : public Base { + public: + + static constexpr std::size_t arity = function_traits::arity; ///< Number of arguments wrapped callable takes + + /** + * @brief Construct a new Wrapper object. + * + * @param f The callable to wrap. + */ + explicit Lambda( Func&& f ) + : Base(arity), f_{std::forward(f)} { + static_assert( arity > 0, "Callable must take at least one argument" ); + static_assert( arity <= MAX_ARITY, "Callable must not take more than MAX_ARITY arguments" ); + } + + /** + * @brief Apply the wrapped function. + * + * @param x The input value. + * @return The result of applying the wrapped function to x. + */ + Real operator()( const std::vector& x ) const override { + if( x.size() != this->arity ) { + std::stringstream msg; + msg << "Received vector of " << x.size() << " arguments, but the wrapped callable's arity is " << this->arity << "."; + throw std::invalid_argument(msg.str()); + } + return call_impl(x, std::make_index_sequence{}); + } + + private: + + /** + * @brief Expands the elements of a vector into the arguments of a function + * @param x The vector of arguments to pass to the callable + */ + template + Real call_impl( const std::vector& x, std::index_sequence ) const { + return f_(x[I]...); + } + + /** + * @brief Accept a visitor, part of the Visitor pattern implementation. + * + * @param visitor The visitor to accept. + */ + void accept( Visitor&& visitor ) const override { visitor.visit(*this); } + + Func f_; ///< The wrapped callable. + }; // class Wrapper + + /** + * @brief Class Template Argument Deduction (CTAD) guide for Wrapper. + * + * This allows the compiler to deduce the template arguments for Wrapper + * when constructing it from a callable. + */ + template + Lambda( Func&& ) -> Lambda>; + +private: + /** + * @brief A base class for visitors that implements the visit method. 
+ * + * This class uses the Curiously Recurring Template Pattern (CRTP) to + * achieve static polymorphism, avoiding virtual function calls. + * + * @tparam Derived The derived visitor class. + */ + template + struct DerivedVisitor : public Visitor { + /** + * @brief Visit a GenericFunction object. + * + * This method casts the visitor to the derived type and calls its visitImpl method. + * + * @param uf The GenericFunction to visit. + */ + void visit( const GenericFunction& gf ) override { + static_cast(this)->visitImpl(gf); + } + }; // struct DerivedVisitor + + /** + * @brief A visitor that applies a function to a vector. + * + * This class implements the actual logic of applying a function to a vector. + * + * @tparam VecT The type of the vector. + * @tparam EvalT The type of the evaluator function. + */ + template + class VectorVisitor : public DerivedVisitor> { + public: + /** + * @brief Construct a new VectorVisitor object. + * + * @param vec The vector to apply the function to. + * @param eval The evaluator function. + */ + VectorVisitor( const std::vector& vecs, EvalT&& eval ) + : vecs_{vecs}, eval_{std::forward(eval)} {} + + /** + * @brief Apply the function to the vector. + * + * This method is called by the visit method of the base DerivedVisitor. + * + * @param uf The GenericFunction to apply. + */ + void visitImpl( const GenericFunction& gf ) { + eval_(vecs_, gf); + } + + private: + const std::vector& vec_; ///< Reference to the pointers to vectors. + EvalT eval_; ///< The evaluator function. 
+ }; // class VectorVisitor + + const std::size_t arity_; ///< Number of Real arguments the function takes + +}; // class GenericFunction + +} // namespace ROL From 8d2047482c135f70bc29ed9f3a7471abaf25ac90 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Fri, 13 Sep 2024 16:44:44 -0600 Subject: [PATCH 056/243] Combined all necessary parameter parsing components into rol_parameters.py which now creates a Teuchos::ParameterList compatible XML output of the valid keys at each level of the parameter hierarchy Signed-off-by: Greg von Winckel --- .../rol_parameters/all_rol_parameters.json | 499 ------------------ packages/rol/rol_parameters/compile_json.py | 32 -- .../rol/rol_parameters/compile_parameters.py | 115 ---- packages/rol/rol_parameters/find_files.py | 78 --- packages/rol/rol_parameters/find_instances.py | 203 ------- .../rol/rol_parameters/list_of_rol_files.txt | 148 ------ .../rol/rol_parameters/read_cpp_source.py | 120 ----- packages/rol/rol_parameters/rol_parameters.py | 169 ++++-- .../rol/rol_parameters/rol_parameters.xml | 442 ++++++++++++++++ 9 files changed, 574 insertions(+), 1232 deletions(-) delete mode 100644 packages/rol/rol_parameters/all_rol_parameters.json delete mode 100644 packages/rol/rol_parameters/compile_json.py delete mode 100644 packages/rol/rol_parameters/compile_parameters.py delete mode 100644 packages/rol/rol_parameters/find_files.py delete mode 100644 packages/rol/rol_parameters/find_instances.py delete mode 100644 packages/rol/rol_parameters/list_of_rol_files.txt delete mode 100644 packages/rol/rol_parameters/read_cpp_source.py create mode 100644 packages/rol/rol_parameters/rol_parameters.xml diff --git a/packages/rol/rol_parameters/all_rol_parameters.json b/packages/rol/rol_parameters/all_rol_parameters.json deleted file mode 100644 index 124e41cd1d51..000000000000 --- a/packages/rol/rol_parameters/all_rol_parameters.json +++ /dev/null @@ -1,499 +0,0 @@ -{"Absolute Value Approximation": {}, - "Adaptive Rank": {}, - "Additive 
Rank Update": {}, - "Adjoint Domain Seed": {}, - "Adjoint Range Seed": {}, - "Adjoint Rank": {}, - "Dimension": {}, - "Distribution": {"Name": {}}, - "Dynamic Constraint": {"Solve": {"Absolute Residual Tolerance": {}, - "Backtracking Factor": {}, - "Iteration Limit": {}, - "Output Iteration History": {}, - "Relative Residual Tolerance": {}, - "Solver Type": {}, - "Step Tolerance": {}, - "Sufficient Decrease Tolerance": {}, - "Zero Initial Guess": {}}}, - "General": {"Inexact Gradient": {}, - "Inexact Hessian-Times-A-Vector": {}, - "Inexact Objective Function": {}, - "Krylov": {"Absolute Tolerance": {}, - "Iteration Limit": {}, - "Relative Tolerance": {}, - "Type": {}, - "User Defined Krylov Name": {}}, - "Output Level": {}, - "Polyhedral Projection": {"Absolute Tolerance": {}, - "Douglas-Rachford": {"Constraint Weight": {}, - "Penalty Parameter": {}, - "Relaxation Parameter": {}}, - "Iteration Limit": {}, - "Multiplier Tolerance": {}, - "Relative Tolerance": {}, - "Semismooth Newton": {"Backtracking Rate": {}, - "Krylov": {}, - "Line Search Type": {}, - "Project onto Separating Hyperplane": {}, - "Regularization Scale": {}, - "Relative Error Scale": {}, - "Step Tolerance": {}, - "Sufficient Decrease Tolerance": {}}, - "Type": {}}, - "Secant": {"Type": {}, - "Use as Hessian": {}, - "Use as Preconditioner": {}, - "User Defined Secant Name": {}}}, - "Log Rank Update Shift": {}, - "Log Rank Update Slope": {}, - "Lower Bound": {}, - "Maximum Rank": {}, - "Maximum Tolerance": {}, - "Mean": {}, - "Number of Quadrature Points": {}, - "Number of Samples": {}, - "OED": {"A-Optimality": {"Number of Samples": {}, - "Randomized Trace Estimation": {}}, - "C-Optimality": {"C Value": {}}, - "Constraint Scaling": {}, - "Double-Well Penalty Parameter": {}, - "I-Optimality": {"Number of Samples": {}, - "Randomized Trace Estimation": {}, - "Use Trace Form": {}}, - "L1 Penalty Parameter": {}, - "Objective Scaling": {}, - "Optimality Type": {}, - "R-Optimality": {"Confidence Level": {}, 
- "Convex Combination Parameter": {}, - "Smoothing Parameter": {}, - "Use Primal-Dual Algorithm": {}}, - "Use Double-Well Penalty": {}, - "Use L1 Penalty": {}, - "Use Scaling": {}, - "Use Storage": {}}, - "Orthogonality Tolerance": {}, - "Output Frequency": {}, - "Plus Function": {"Smoothing Parameter": {}}, - "Points File Name": {}, - "Print Optimization Vector": {}, - "Print Quadrature to Screen": {}, - "Rank Update Factor": {}, - "Reorthogonalization Iterations": {}, - "SOL": {"Deviation Measure": {"CVaR": {}, - "Entropic": {}, - "Generalized Moreau-Yosida CVaR": {}, - "Log Quantile": {}, - "Moreau-Yosida CVaR": {}, - "Name": {}, - "Smoothed Upper Range": {}, - "Truncated Mean": {}}, - "Distribution": {"Arcsine": {"Lower Bound": {}, "Upper Bound": {}}, - "Beta": {"Shape 1": {}, "Shape 2": {}}, - "Cauchy": {"Location": {}, "Scale": {}}, - "Dirac": {"Location": {}}, - "Exponential": {"Location": {}, "Scale": {}}, - "Gamma": {"Scale": {}, "Shape": {}}, - "Gaussian": {"Mean": {}, "Variance": {}}, - "Gumbel": {"Location": {}, "Scale": {}}, - "Kumaraswamy": {"Exponent 1": {}, - "Exponent 2": {}, - "Lower Bound": {}, - "Upper Bound": {}}, - "Laplace": {"Mean": {}, "Scale": {}}, - "Logistic": {"Mean": {}, "Scale": {}}, - "Name": {}, - "Parabolic": {"Lower Bound": {}, "Upper Bound": {}}, - "Raised Cosine": {"Mean": {}, "Scale": {}}, - "Smale": {"Lower Bound": {}, "Upper Bound": {}}, - "Triangle": {"Lower Bound": {}, - "Peak Location": {}, - "Upper Bound": {}}, - "Truncated Exponential": {}, - "Truncated Gaussian": {}, - "Uniform": {"Lower Bound": {}, "Upper Bound": {}}}, - "Error Measure": {"Exponential": {}, - "Generalized Moreau-Yosida-Koenker-Bassett": {}, - "Huber": {}, - "Koenker-Bassett": {}, - "Log Quantile": {}, - "Moreau-Yosida-Koenker-Bassett": {}, - "Name": {}, - "Smoothed Worst Case": {}}, - "Initial Statistic": {}, - "Objective": {"Risk Measure": {"CVaR": {"Confidence Level": {}, - "Convex Combination Parameter": {}}, - "Confidence Level": {}, - "Convex 
Combination Parameter": {}, - "Name": {}, - "Smoothing Parameter": {}}, - "Risk Neutral": {"Use Storage": {}}, - "Store Sampled Value and Gradient": {}, - "Type": {}}, - "Primal Dual Risk": {"Dual Tolerance": {}, - "Dual Tolerance Decrease Exponent": {}, - "Dual Tolerance Update Exponent": {}, - "Dual Tolerance Update Scale": {}, - "Initial Constraint Tolerance": {}, - "Initial Dual Tolerance": {}, - "Initial Gradient Tolerance": {}, - "Initial Penalty Parameter": {}, - "Iteration Limit": {}, - "Maximum Penalty Parameter": {}, - "Penalty Update Scale": {}, - "Print Subproblem Solve History": {}, - "Solver Tolerance Decrease Scale": {}, - "Solver Tolerance Update Scale": {}, - "Update Frequency": {}}, - "Probability": {"Name": {}, "bPOE": {"Threshold": {}}}, - "Progressive Hedging": {"Dynamic Tolerance": {}, - "Fixed Tolerance": {}, - "Initial Penalty Parameter": {}, - "Iteration Limit": {}, - "Maximum Penalty Parameter": {}, - "Nonanticipativity Constraint Tolerance": {}, - "Penalty Update Frequency": {}, - "Penalty Update Scale": {}, - "Print Subproblem Solve History": {}, - "Use Inexact Solve": {}, - "Use Presolve": {}}, - "Regret Measure": {"Exponential": {}, - "Generalized Moreau-Yosida Mean Absolute Loss": {}, - "Log Quantile": {}, - "Mean Absolute Loss": {}, - "Mean L2": {}, - "Moreau-Yosida Mean Absolute Loss": {}, - "Name": {}, - "Smoothed Worst Case": {}, - "Truncated Mean": {}}, - "Risk Measure": {"CVaR": {"Confidence Level": {}, - "Convex Combination Parameter": {}}, - "Chebyshev Spectral Risk": {}, - "Convex Combination Risk Measure": {}, - "Entropic Risk": {}, - "F-Divergence": {}, - "Generalized Moreau-Yosida CVaR": {}, - "HMCR": {"Confidence Level": {}, - "Convex Combination Parameter": {}}, - "KL Divergence": {}, - "Log Quantile": {}, - "Mean Plus Deviation": {}, - "Mean Plus Deviation From Target": {}, - "Mean Plus Semi-Deviation": {"Coefficient": {}}, - "Mean Plus Semi-Deviation From Target": {"Coefficient": {}, - "Target": {}}, - "Mean Plus 
Variance": {}, - "Mean Plus Variance From Target": {}, - "Mixed CVaR": {}, - "Moreau-Yosida CVaR": {}, - "Name": {}, - "Quantile Radius": {}, - "Safety Margin": {}, - "Second Order CVaR": {}, - "Smoothed Worst Case": {}, - "Spectral Risk": {}, - "Truncated Mean": {}}, - "Sample Generator": {"SROM": {"Adaptive Sampling": {}, - "Atom Tolerance": {}, - "CDF Smoothing Parameter": {}, - "Number of New Samples Per Adaptation": {}, - "Number of Samples": {}, - "Presolve for Atom Locations": {}, - "Probability Tolerance": {}}, - "User Input": {}}, - "Store Sampled Value and Gradient": {}, - "Type": {}}, - "Scalar Minimization": {"Bisection": {"Iteration Limit": {}, "Tolerance": {}}, - "Brent"s": {"Iteration Limit": {}, "Tolerance": {}}, - "Golden Section": {"Iteration Limit": {}, - "Tolerance": {}}, - "Iteration Limit": {}, - "Tolerance": {}, - "Type": {}}, - "Scale": {}, - "SimOpt": {"Solve": {"Absolute Residual Tolerance": {}, - "Backtracking Factor": {}, - "Iteration Limit": {}, - "Output Iteration History": {}, - "Relative Residual Tolerance": {}, - "Solver Type": {}, - "Step Tolerance": {}, - "Sufficient Decrease Tolerance": {}, - "Zero Initial Guess": {}}}, - "Smoothing Parameter": {}, - "Standard Deviation": {}, - "State Domain Seed": {}, - "State Range Seed": {}, - "State Rank": {}, - "State Sensitivity Domain Seed": {}, - "State Sensitivity Range Seed": {}, - "State Sensitivity Rank": {}, - "Status Test": {"Constraint Tolerance": {}, - "Gradient Scale": {}, - "Gradient Tolerance": {}, - "Iteration Limit": {}, - "Proximal Gradient Parameter": {}, - "Step Tolerance": {}, - "Use Relative Tolerances": {}}, - "Step": {"Augmented Lagrangian": {"Constraint Scaling": {}, - "Feasibility Tolerance Decrease Exponent": {}, - "Feasibility Tolerance Update Exponent": {}, - "Initial Feasibility Tolerance": {}, - "Initial Optimality Tolerance": {}, - "Initial Penalty Parameter": {}, - "Level of Hessian Approximation": {}, - "Maximum Penalty Parameter": {}, - "Objective Scaling": 
{}, - "Optimality Tolerance Decrease Exponent": {}, - "Optimality Tolerance Update Exponent": {}, - "Penalty Parameter Growth Factor": {}, - "Penalty Parameter Reciprocal Lower Bound": {}, - "Print Intermediate Optimization History": {}, - "Subproblem Iteration Limit": {}, - "Use Default Initial Penalty Parameter": {}, - "Use Default Problem Scaling": {}, - "Use Scaled Augmented Lagrangian": {}}, - "Bundle": {"Cutting Plane Iteration Limit": {}, - "Cutting Plane Tolerance": {}, - "Distance Measure Coefficient": {}, - "Epsilon Solution Tolerance": {}, - "Initial Trust-Region Parameter": {}, - "Locality Measure Coefficient": {}, - "Lower Threshold for Serious Step": {}, - "Maximum Bundle Size": {}, - "Removal Size for Bundle Update": {}, - "Upper Threshold for Null Step": {}, - "Upper Threshold for Serious Step": {}}, - "Composite Step": {"Initial Radius": {}, - "Optimality System Solver": {"Fix Tolerance": {}, - "Iteration Limit": {}, - "Nominal Relative Tolerance": {}}, - "Tangential Subproblem Solver": {"Iteration Limit": {}, - "Relative Tolerance": {}}, - "Use Constraint Hessian": {}}, - "Fletcher": {"Inexact Solves": {}, - "Level of Hessian Approximation": {}, - "Maximum Penalty Parameter": {}, - "Minimum Penalty Parameter": {}, - "Minimum Regularization Parameter": {}, - "Modify Penalty Parameter": {}, - "Penalty Parameter": {}, - "Penalty Parameter Growth Factor": {}, - "Quadratic Penalty Parameter": {}, - "Regularization Parameter": {}, - "Regularization Parameter Decrease Factor": {}, - "Subproblem Iteration Limit": {}}, - "Interior Point": {"Barrier Penalty Reduction Factor": {}, - "Initial Barrier Parameter": {}, - "Linear Damping Coefficient": {}, - "Maximum Barrier Parameter": {}, - "Minimum Barrier Parameter": {}, - "Subproblem": {"Feasibility Tolerance Reduction Factor": {}, - "Initial Feasibility Tolerance": {}, - "Initial Optimality Tolerance": {}, - "Iteration Limit": {}, - "Optimality Tolerance Reduction Factor": {}, - "Print History": {}, - "Step 
Type": {}}, - "Use Linear Damping": {}}, - "Line Search": {"Accept Last Alpha": {}, - "Accept Linesearch Minimizer": {}, - "Apply Prox to Initial Guess": {}, - "Curvature Condition": {"General Parameter": {}, - "Generalized Wolfe Parameter": {}, - "Type": {}}, - "Descent Method": {"Nonlinear CG Type": {}, - "Type": {}, - "User Defined Descent Direction Name": {}, - "User Defined Nonlinear CG Name": {}}, - "Finite Difference Directional Derivative": {}, - "Function Evaluation Limit": {}, - "Inexact Newton": {"Lower Step Size Safeguard": {}, - "Subproblem Absolute Tolerance": {}, - "Subproblem Iteration Limit": {}, - "Subproblem Relative Tolerance": {}, - "Subproblem Solver": {}, - "Subproblem Tolerance Exponent": {}, - "Upper Step Size Safeguard": {}}, - "Initial Step Size": {}, - "Line-Search Method": {"Backtracking Rate": {}, - "Increase Rate": {}, - "Iteration Limit": {}, - "Path-Based Target Level": {"Target Relaxation Parameter": {}, - "Upper Bound on Path Length": {}}, - "Tolerance": {}, - "Type": {}, - "User Defined Line Search Name": {}}, - "Lower Bound for Initial Step Size": {}, - "Maximum Number of Function Evaluations": {}, - "Maximum Step Size": {}, - "Normalize Initial Step Size": {}, - "PQN": {"Lower Step Size Safeguard": {}, - "Subproblem Absolute Tolerance": {}, - "Subproblem Iteration Limit": {}, - "Subproblem Relative Tolerance": {}, - "Subproblem Solver": {}, - "Upper Step Size Safeguard": {}}, - "Quasi-Newton": {"L-Secant-B": {"Cauchy Point": {"Decrease Tolerance": {}, - "Expansion Rate": {}, - "Initial Step Size": {}, - "Maximum Number of Expansion Steps": {}, - "Maximum Number of Reduction Steps": {}, - "Normalize Initial Step Size": {}, - "Reduction Rate": {}}, - "Relative Tolerance Exponent": {}, - "Sufficient Decrease Parameter": {}}, - "Method": {}}, - "Status Test": {"Gradient Tolerance": {}}, - "Sufficient Decrease Tolerance": {}, - "Use Adaptive Step Size Selection": {}, - "Use Previous Step Length as Initial Guess": {}, - "User Defined 
Initial Step Size": {}}, - "Moreau-Yosida Penalty": {"Initial Penalty Parameter": {}, - "Maximum Penalty Parameter": {}, - "Penalty Parameter Growth Factor": {}, - "Subproblem": {"Feasibility Tolerance": {}, - "Iteration Limit": {}, - "Optimality Tolerance": {}, - "Print History": {}, - "Step Type": {}, - "Use Relative Tolerances": {}}, - "Update Multiplier": {}, - "Update Penalty": {}}, - "Primal Dual Active Set": {"Dual Scaling": {}, - "Iteration Limit": {}, - "Relative Gradient Tolerance": {}, - "Relative Step Tolerance": {}}, - "Primal Dual Interior Point": {"Barrier Objective": {"Initial Barrier Parameter": {}, - "Linear Damping Coefficient": {}, - "Use Linear Damping": {}}}, - "Spectral Gradient": {"Apply Prox to Initial Guess": {}, - "Backtracking Rate": {}, - "Function Evaluation Limit": {}, - "Initial Spectral Step Size": {}, - "Lower Step Size Safeguard": {}, - "Maximum Spectral Step Size": {}, - "Maximum Storage Size": {}, - "Minimum Spectral Step Size": {}, - "Sufficient Decrease Tolerance": {}, - "Upper Step Size Safeguard": {}}, - "Stabilized LCL": {"Constraint Scaling": {}, - "Elastic Penalty Parameter Growth Rate": {}, - "Feasibility Tolerance Decrease Exponent": {}, - "Feasibility Tolerance Increase Exponent": {}, - "Initial Elastic Penalty Parameter": {}, - "Initial Feasibility Tolerance": {}, - "Initial Optimality Tolerance": {}, - "Initial Penalty Parameter": {}, - "Level of Hessian Approximation": {}, - "Maximum Elastic Penalty Parameter": {}, - "Maximum Penalty Parameter": {}, - "Objective Scaling": {}, - "Optimality Tolerance Decrease Exponent": {}, - "Optimality Tolerance Increase Exponent": {}, - "Penalty Parameter Growth Factor": {}, - "Subproblem Iteration Limit": {}, - "Use Default Initial Penalty Parameter": {}, - "Use Default Problem Scaling": {}, - "Use Scaled Stabilized LCL": {}}, - "Trust Region": {"Apply Prox to Initial Guess": {}, - "Coleman-Li": {"Relative Tolerance Exponent": {}, - "Relaxation Safeguard": {}, - "Sufficient 
Decrease Parameter": {}}, - "General": {"Output Level": {}}, - "Inexact": {"Gradient": {"Relative Tolerance": {}, - "Tolerance Scaling": {}}, - "Value": {"Exponent": {}, - "Forcing Sequence Initial Value": {}, - "Forcing Sequence Reduction Factor": {}, - "Forcing Sequence Update Frequency": {}, - "Tolerance Scaling": {}}}, - "Initial Radius": {}, - "Kelley-Sachs": {"Binding Set Tolerance": {}, - "Initial Post-Smoothing Step Size": {}, - "Maximum Number of Smoothing Iterations": {}, - "Post-Smoothing Backtracking Rate": {}, - "Post-Smoothing Decrease Parameter": {}, - "Sufficient Decrease Parameter": {}}, - "Lin-More": {"Cauchy Point": {"Decrease Tolerance": {}, - "Expansion Rate": {}, - "Initial Step Size": {}, - "Maximum Number of Expansion Steps": {}, - "Maximum Number of Reduction Steps": {}, - "Normalize Initial Step Size": {}, - "Reduction Rate": {}}, - "Maximum Number of Minor Iterations": {}, - "Projected Search": {"Backtracking Rate": {}, - "Maximum Number of Steps": {}}, - "Relative Tolerance Exponent": {}, - "Sufficient Decrease Parameter": {}}, - "Maximum Radius": {}, - "Nonmonotone Storage Limit": {}, - "Nonmonotone Storage Size": {}, - "Radius Growing Rate": {}, - "Radius Growing Threshold": {}, - "Radius Shrinking Threshold": {}, - "SPG": {"Cauchy Point": {"Decrease Tolerance": {}, - "Expansion Rate": {}, - "Initial Step Size": {}, - "Maximum Number of Expansion Steps": {}, - "Maximum Number of Reduction Steps": {}, - "Normalize Initial Step Size": {}, - "Reduction Rate": {}}, - "Relative Tolerance Exponent": {}, - "Solver": {"Absolute Tolerance": {}, - "Compute Cauchy Point": {}, - "Iteration Limit": {}, - "Maximum Spectral Step Size": {}, - "Maximum Storage Size": {}, - "Minimum Spectral Step Size": {}, - "Relative Tolerance": {}, - "Sufficient Decrease Tolerance": {}, - "Use Nonmonotone Search": {}, - "Use Smallest Model Iterate": {}}, - "Sufficient Decrease Parameter": {}}, - "Safeguard Size": {}, - "Step Acceptance Threshold": {}, - "Subproblem 
Model": {}, - "Subproblem Solver": {}, - "TRN": {"Cauchy Point": {"Decrease Tolerance": {}, - "Expansion Rate": {}, - "Initial Step Size": {}, - "Maximum Number of Expansion Steps": {}, - "Maximum Number of Reduction Steps": {}, - "Normalize Initial Step Size": {}, - "Reduction Rate": {}}, - "Relative Tolerance Exponent": {}, - "Solver": {"Absolute Tolerance": {}, - "Iteration Limit": {}, - "Maximum Spectral Step Size": {}, - "Maximum Storage Size": {}, - "Minimum Spectral Step Size": {}, - "NCG": {"Descent Parameter": {}, - "Nonlinear CG Type": {}, - "Truncation Parameter for HZ CG": {}}, - "Relative Tolerance": {}, - "Subproblem Solver": {}, - "Sufficient Decrease Tolerance": {}, - "Use Nonmonotone Search": {}, - "Use Smallest Model Iterate": {}}, - "Sufficient Decrease Parameter": {}}, - "Use Radius Interpolation": {}}, - "Type": {}, - "iPiano": {"Apply Prox to Initial Guess": {}, - "Backtracking Rate": {}, - "Increase Rate": {}, - "Initial Lipschitz Constant Estimate": {}, - "Lower Interpolation Factor": {}, - "Momentum Parameter": {}, - "Reduction Iteration Limit": {}, - "Upper Interpolation Factor": {}, - "Use Constant Beta": {}}}, - "Sync Hessian Rank": {}, - "Truncate Approximation": {}, - "Upper Bound": {}, - "Use Basic Rank Update": {}, - "Use Hessian": {}, - "Use Only Sketched Sensitivity": {}, - "Use Sketching": {}, - "Weight Type": {}, - "Weights File Name": {}} diff --git a/packages/rol/rol_parameters/compile_json.py b/packages/rol/rol_parameters/compile_json.py deleted file mode 100644 index f5ab689c9a00..000000000000 --- a/packages/rol/rol_parameters/compile_json.py +++ /dev/null @@ -1,32 +0,0 @@ -import re -import pathlib -import json -from collections import OrderedDict -from find_files import find_files -from read_cpp_source import read_cpp_source - -def compile_json( pattern : re.Pattern, - root_dir : pathlib.Path, - relative_pathfiles : list[pathlib.Path], - num_capture_groups : int = 1 ) -> str: - - all_instances = OrderedDict() - - for 
relative_pathfile in relative_pathfiles: - cpp = read_cpp_source(root_dir / relative_pathfile) - matches = list(re.finditer(pattern, cpp)) - file_str = str(relative_pathfile) - - if len(matches): - for m in matches: - key_name = m.group(1).strip() - if key_name not in all_instances.keys(): - all_instances[key_name] = {file_str} - else: - all_instances[key_name].add(file_str) - - for k,v in all_instances.items(): - all_instances[k] = list(v) - - return json.dumps(all_instances,indent=4) - diff --git a/packages/rol/rol_parameters/compile_parameters.py b/packages/rol/rol_parameters/compile_parameters.py deleted file mode 100644 index d4c620436475..000000000000 --- a/packages/rol/rol_parameters/compile_parameters.py +++ /dev/null @@ -1,115 +0,0 @@ -import re -import pathlib -import subprocess -from typing import Set, Optional, List, Tuple -from read_cpp_source import read_cpp_source - -# Compile regex patterns once -SUBLIST_PATTERN = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)', re.MULTILINE) -GET_KEY_PATTERN = re.compile(r'\bget\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) -SET_KEY_PATTERN = re.compile(r'\bset\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) - -def find_instances(root_path: pathlib.Path, - search_token: str, - include: Optional[str|Set[str]] = None, - exclude: Optional[str|Set[str]] = None, - exclude_dir: Optional[str|Set[str]] = None) -> Set[pathlib.Path]: - - # Ensure the root path is an existant directory - assert( root_path.exists() ) - assert( root_path.is_dir() ) - - def join(arg): - if isinstance(arg,str): - return [arg] - else: - return list(arg) - - cmd = ['grep','-r',search_token] - - if include is not None: - for inc in join(include): - cmd.append(f'--include={inc}') - - if exclude is not None: - for exc in join(exclude): - cmd.append(f'--exclude={exc}') - - if exclude_dir is not None: - for exc_dir in join(exclude_dir): - cmd.append(f'--exclude-dir={exc_dir}') - - cmd.append(str(root_path)) - - result = subprocess.run(cmd, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - - # Check if the command was successful - if result.returncode != 0: - raise Exception(f"Error executing grep: {result.stderr}") - - make_relative = lambda path_str : pathlib.Path(path_str).relative_to(root_path,walk_up=True) - - files = { make_relative(line.split(':')[0]) for line in result.stdout.splitlines() } - return files - - - -def parse_cpp_file(file_path: pathlib.Path) -> Set[Tuple[str, ...]]: - cpp = read_cpp_source(file_path) - cpp = re.sub(';', '\n', cpp) - - def has_token(line: str) -> bool: - return ('sublist(' in line) or ('get(' in line) or ('set(' in line) - - lines = [re.sub(r'\s+', ' ', line).strip() for line in cpp.splitlines() if has_token(line) and '"' in line] - - names = {} - code = [] - instances = set() - - for line in lines: - line = re.sub(r'->', '.', line) - if '&' in line: - assignment = line.split('&')[1].strip() - lhs, rhs = assignment.split('=') - names[lhs.strip()] = rhs.strip().split('.') - else: - code.append(line.strip() if '=' not in line else line.split('=')[1].strip()) - - for k, v in names.items(): - if v[0] in names: - names[k] = names[v[0]] + v[1:] - for c in code: - elem = c.split('.') - if elem[0] in names: - elem = names[elem[0]] + elem[1:] - if len(elem) > 1: - tpl = tuple(filter(has_token, elem)) - if all((e.count('"') in [2, 4]) for e in tpl): - instances.add(tuple(e.split('"')[1] for e in tpl)) - - return instances - - - - - -def write_to_csv(instances: List[Tuple[str, ...]], output_file: str): - with open(output_file, 'w') as f: - for line in instances: - f.write(','.join(line) + '\n') - -def main(): - rol_src = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src') - - relative_filepaths = find_instances(rol_src, 'ParameterList', - include={'*.hpp', '*.cpp'}, - exclude_dir={'compatibility', 'step', 'zoo'}) - all_instances = set() - for filepath in relative_filepaths: - all_instances.update(parse_cpp_file(rol_src / filepath)) - - 
write_to_csv(sorted(all_instances), 'all_parameters.csv') - -if __name__ == '__main__': - main() diff --git a/packages/rol/rol_parameters/find_files.py b/packages/rol/rol_parameters/find_files.py deleted file mode 100644 index 161497eab4a6..000000000000 --- a/packages/rol/rol_parameters/find_files.py +++ /dev/null @@ -1,78 +0,0 @@ -import subprocess -import pathlib - -def find_files( root_path : pathlib.Path, - search_token : str, - include : list[str]=[], - exclude : list[str]=[]) -> list[pathlib.Path]: - """ - Searches for files within a directory tree that contain a specified search token using - the Unix/MacOS command line tool `grep`. - - This function wraps the Unix `grep` command to recursively search through files - starting from a root directory. It returns a list of `pathlib.Path` objects for - files that contain the specified search token. The search can be further refined - by specifying patterns for files to include or exclude. - - Parameters: - - root_path (pathlib.Path): The root directory from which the search will begin. - Must be a valid directory path. - - search_token (str): The token to search for within files. This is passed directly - to `grep`, so regular expressions can be used. - - includes (list[str], optional): A list of patterns to include in the search. - Patterns should match the file names to include. - For example, ['*.py'] to include only Python files. - Defaults to an empty list, which includes all files. - - excludes (list[str], optional): A list of patterns to exclude from the search. - Patterns should match the file names to exclude. - For example, ['*.txt'] to exclude all text files. - Defaults to an empty list, which excludes no files. - - Returns: - - list[pathlib.Path]: A list of `pathlib.Path` objects (relative to `root_path`), - each representing a file that contains the search token. The - list will be empty if no matching files are found. 
- - Raises: - - Exception: If the `grep` command fails for any reason (e.g., due to an invalid - root_path or issues executing `grep`), an exception is raised with - the error message from `grep`. - - Example: - >>> find_files(pathlib.Path('/path/to/search'), 'def main', includes=['*.py']) - [PosixPath('script1.py'), PosixPath('script2.py')] - - Note: - - This function relies on the Unix `grep` command and may not be portable to - environments without `grep` (e.g., some Windows environments without Unix-like - tools installed). - """ - - # Ensure the root path is an existant directory - assert( root_path.exists() ) - assert( root_path.is_dir() ) - - if isinstance(include,str): - include = [include] if len(include) else [] - if isinstance(exclude,str): - exclude = [exclude] if len(exclude) else [] - - cmd = ['grep','-rl',search_token] - - if len(include): - cmd += [f'--include={inc}' for inc in include] - if len(exclude): - cmd += [f'--exclude={exc}' for exc in exclude] - - cmd.append(str(root_path)) - - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - - # Check if the command was successful - if result.returncode != 0: - raise Exception(f"Error executing grep: {result.stderr}") - - make_relative = lambda path_str : pathlib.Path(path_str).relative_to(root_path,walk_up=True) - - # Parse the output into a list of relative Path objects (relative to root_path) - return sorted([make_relative(line.strip()) for line in result.stdout.splitlines()]) diff --git a/packages/rol/rol_parameters/find_instances.py b/packages/rol/rol_parameters/find_instances.py deleted file mode 100644 index ef5d2f0b368e..000000000000 --- a/packages/rol/rol_parameters/find_instances.py +++ /dev/null @@ -1,203 +0,0 @@ -import re -import subprocess -import pathlib -from pprint import pprint -from typing import Set, Optional - - - -def run_grep_command(src_directory): - grep_command = [ - 'grep', - '-rE', - '-e', - 
r'(\.|\->)\s*(((s|g)et\s*\(\s*"([a-zA-Z0-9]|\s)+"\s*,\s*\S+\s*\))|sublist)', - '-e', - r'(\.|\->)\s*sublist\s*\(\s*\"', - src_directory - ] - - try: - result = subprocess.run(grep_command, capture_output=True, text=True, check=True) - return result.stdout - except subprocess.CalledProcessError as e: - print(f"Error occurred: {e}") - return e.stderr - - -def split_cpp_code(code_string): - # Use a regular expression to split on both '.' and '->' - # The regex looks for either '->' or '.' as delimiters - split_pattern = r'->|\.' - - # Split the string and discard the delimiters - tokens = re.split(split_pattern, code_string) - - # Remove any empty strings from the result and strip whitespace - tokens = [token.strip() for token in tokens if token.strip()] - - return tokens - - -def extract_quoted_substring(input_string): - # Regular expression pattern to match content between double quotes - pattern = r'"([^"]*)"' - - # Search for the pattern in the input string - match = re.search(pattern, input_string) - - if match: - # If a match is found, return the content between the quotes - return match.group(1) - else: - # If no match is found, return None or an empty string - return None # or return "" if you prefer - - - - - - - - - - - -def extract_quoted_strings(string_list): - return tuple((s.strip('"') for s in string_list if s.startswith('"') and s.endswith('"'))) - -def custom_sort_key(sublist): - return sublist[:len(sublist)] - -def sort_list_of_lists(list_of_lists): - return sorted(list_of_lists, key=custom_sort_key) - - - - -def parse_cpp_strings(input_list): - parsed_list = [] - - for item in input_list: - # Match a word without parentheses, a quoted string inside parentheses, - # or a quoted string as the first argument of get() or set() - match = re.search(r'(\w+)$|"([^"]*)"|\b(?:get|set)\s*\(\s*"([^"]*)"', item) - if match: - if match.group(1): # If it's a word without parentheses - parsed_list.append(match.group(1)) - elif match.group(2): # If it's a quoted 
string inside parentheses - parsed_list.append(f'"{match.group(2)}"') - elif match.group(3): # If it's a quoted string in get() or set() - parsed_list.append(f'"{match.group(3)}"') - - return parsed_list - -def build_hierarchy(data): - def resolve_list(value_list): - if not value_list: - return value_list - - first_item = value_list[0] - if first_item in data and not first_item.startswith('"'): - return resolve_list(data[first_item]) + value_list[1:] - else: - return [first_item] + resolve_list(value_list[1:]) - - return {key: resolve_list(value) for key, value in data.items()} - -def build_list_hierarchy(data_dict, input_lists): - def resolve_list(value_list): - if not value_list: - return value_list - - first_item = value_list[0] - if first_item in data_dict and not first_item.startswith('"'): - return resolve_list(data_dict[first_item]) + value_list[1:] - else: - return [first_item] + resolve_list(value_list[1:]) - - return [resolve_list(sublist) for sublist in input_lists] - -def create_hierarchical_dict(list_of_lists): - result = {} - for path in list_of_lists: - current = result - for key in path[:-1]: - if key not in current: - current[key] = {} - current = current[key] - current[path[-1]] = {} - return result - -if __name__ == '__main__': - - # Every line contains an instance calling at least one of the three functions: - # - # - ParameterList::sublist - # - ParameterList::get - # - ParameterList::set - # - # 1) Defining a local sublist variable - # 2) Getting a parameter - # 3) Setting a parameter - - rol_src = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol/src') - - make_relative = lambda path_str : pathlib.Path(path_str).relative_to(rol_src,walk_up=True) - - strip_excess_whitespace = lambda text : re.sub(r'\s+',' ',text).strip() - - sublist_pattern = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)') - - data = dict() - - exclusions = ['compatibility','step','zoo'] - - local_sublist_pattern = 
re.compile(r'[ParameterList|auto]\s*[&]\s*(\w+)\s*=\s*(\w+)[\.|\->](.*)') - - output = run_grep_command(rol_src) - for line in output.splitlines(): - splitline = line.split(':') - file = str(make_relative(splitline[0])) - code = strip_excess_whitespace(':'.join(splitline[1:])) - if not any(f'{e}/' in file for e in exclusions): - if file not in data.keys(): - data[file] = [code] - else: - data[file].append(code) - -# with open('list_of_rol_files.txt','w') as f: -# f.write('\n'.join(sorted(data.keys()))) - - paramset = set() - - for file, code in data.items(): -# print(f'{file}') - sublist = dict() - parameters = list() - for line in code: -# print(line) - # Look for locally defined sublists - match = re.search(local_sublist_pattern,line) - if match: - sublist[match.group(1)] = [match.group(2)] + parse_cpp_strings( split_cpp_code(match.group(3))) - else: - if '=' in line: - line = line.split('=')[1].strip() - parameters.append(parse_cpp_strings(split_cpp_code(line))) - sublist = build_hierarchy(sublist) -# print(sublist) - parameters = build_list_hierarchy(sublist,parameters) - [ paramset.add(tuple(p)) for p in map(extract_quoted_strings,parameters)] - - parameters = sorted(filter(len,map(list,paramset))) - -# for p in parameters: -# print(p) - - parameters = create_hierarchical_dict(parameters) - - -# pprint(parameters) -# for p in paramset: -# print(p) diff --git a/packages/rol/rol_parameters/list_of_rol_files.txt b/packages/rol/rol_parameters/list_of_rol_files.txt deleted file mode 100644 index 49a3a4f21b40..000000000000 --- a/packages/rol/rol_parameters/list_of_rol_files.txt +++ /dev/null @@ -1,148 +0,0 @@ -algorithm/ROL_OptimizationProblem.hpp -algorithm/ROL_OptimizationSolver.hpp -algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp -algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_GradientAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_InteriorPointAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp 
-algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_MoreauYosidaAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_NewtonKrylovAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_PrimalDualActiveSetAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_QuasiNewtonAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_SpectralGradientAlgorithm_Def.hpp -algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp -algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp -algorithm/TypeE/ROL_TypeE_AugmentedLagrangianAlgorithm_Def.hpp -algorithm/TypeE/ROL_TypeE_CompositeStepAlgorithm_Def.hpp -algorithm/TypeE/ROL_TypeE_FletcherAlgorithm_Def.hpp -algorithm/TypeE/ROL_TypeE_StabilizedLCLAlgorithm_Def.hpp -algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp -algorithm/TypeG/ROL_TypeG_AugmentedLagrangianAlgorithm_Def.hpp -algorithm/TypeG/ROL_TypeG_InteriorPointAlgorithm_Def.hpp -algorithm/TypeG/ROL_TypeG_MoreauYosidaAlgorithm_Def.hpp -algorithm/TypeG/ROL_TypeG_StabilizedLCLAlgorithm_Def.hpp -algorithm/TypeG/augmentedlagrangian/ROL_AugmentedLagrangianObjective.hpp -algorithm/TypeG/fletcher/ROL_FletcherObjectiveBase_Def.hpp -algorithm/TypeG/interiorpoint/ROL_InteriorPointObjective.hpp -algorithm/TypeG/moreauyosida/ROL_MoreauYosidaObjective.hpp -algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp -algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm_Def.hpp -algorithm/TypeP/ROL_TypeP_ProxGradientAlgorithm_Def.hpp -algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm_Def.hpp -algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp -algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp -algorithm/TypeP/ROL_TypeP_iPianoAlgorithm_Def.hpp -algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp -algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp -algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp -algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp -algorithm/TypeU/linesearch/ROL_BackTracking_U.hpp -algorithm/TypeU/linesearch/ROL_CubicInterp_U.hpp 
-algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp -algorithm/TypeU/linesearch/ROL_LineSearch_U_Factory.hpp -algorithm/TypeU/linesearch/ROL_PathBasedTargetLevel_U.hpp -algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp -algorithm/TypeU/linesearch/descent/ROL_DescentDirection_U_Factory.hpp -algorithm/TypeU/linesearch/descent/ROL_NewtonKrylov_U.hpp -algorithm/TypeU/linesearch/descent/ROL_NonlinearCG_U.hpp -algorithm/TypeU/linesearch/descent/ROL_QuasiNewton_U.hpp -algorithm/TypeU/trustregion/ROL_SPGTrustRegion_U.hpp -algorithm/TypeU/trustregion/ROL_TruncatedCG_U.hpp -algorithm/TypeU/trustregion/ROL_TrustRegionModel_U.hpp -algorithm/TypeU/trustregion/ROL_TrustRegion_U_Factory.hpp -function/dynamic/ROL_DynamicConstraint.hpp -function/dynamic/ROL_ReducedDynamicObjective.hpp -function/polyproj/ROL_BrentsProjection_Def.hpp -function/polyproj/ROL_DaiFletcherProjection_Def.hpp -function/polyproj/ROL_DouglasRachfordProjection_Def.hpp -function/polyproj/ROL_DykstraProjection_Def.hpp -function/polyproj/ROL_PolyhedralProjectionFactory.hpp -function/polyproj/ROL_RiddersProjection_Def.hpp -function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp -function/simopt/ROL_Constraint_SimOpt.hpp -oed/ROL_OED_Factory_Def.hpp -sol/algorithm/ROL_PrimalDualRisk.hpp -sol/algorithm/ROL_ProgressiveHedging.hpp -sol/algorithm/ROL_StochasticProblem_Def.hpp -sol/function/ROL_AbsoluteValue.hpp -sol/function/ROL_PlusFunction.hpp -sol/function/ROL_RiskBoundConstraint.hpp -sol/function/distribution/ROL_Arcsine.hpp -sol/function/distribution/ROL_Beta.hpp -sol/function/distribution/ROL_Cauchy.hpp -sol/function/distribution/ROL_Dirac.hpp -sol/function/distribution/ROL_DistributionFactory.hpp -sol/function/distribution/ROL_Exponential.hpp -sol/function/distribution/ROL_Gamma.hpp -sol/function/distribution/ROL_Gaussian.hpp -sol/function/distribution/ROL_Gumbel.hpp -sol/function/distribution/ROL_Kumaraswamy.hpp -sol/function/distribution/ROL_Laplace.hpp -sol/function/distribution/ROL_Logistic.hpp 
-sol/function/distribution/ROL_Parabolic.hpp -sol/function/distribution/ROL_RaisedCosine.hpp -sol/function/distribution/ROL_Smale.hpp -sol/function/distribution/ROL_Triangle.hpp -sol/function/distribution/ROL_TruncatedExponential.hpp -sol/function/distribution/ROL_TruncatedGaussian.hpp -sol/function/distribution/ROL_Uniform.hpp -sol/function/expectationquad/ROL_GenMoreauYosidaCVaR.hpp -sol/function/expectationquad/ROL_LogExponentialQuadrangle.hpp -sol/function/expectationquad/ROL_LogQuantileQuadrangle.hpp -sol/function/expectationquad/ROL_MeanVarianceQuadrangle.hpp -sol/function/expectationquad/ROL_MoreauYosidaCVaR.hpp -sol/function/expectationquad/ROL_QuantileQuadrangle.hpp -sol/function/expectationquad/ROL_SmoothedWorstCaseQuadrangle.hpp -sol/function/expectationquad/ROL_TruncatedMeanQuadrangle.hpp -sol/function/progressivehedging/ROL_PH_DeviationObjective.hpp -sol/function/progressivehedging/ROL_PH_ErrorObjective.hpp -sol/function/progressivehedging/ROL_PH_Objective.hpp -sol/function/progressivehedging/ROL_PH_ProbObjective.hpp -sol/function/progressivehedging/ROL_PH_RegretObjective.hpp -sol/function/progressivehedging/ROL_PH_RiskObjective.hpp -sol/function/progressivehedging/ROL_PH_bPOEObjective.hpp -sol/function/randvarfunctional/ROL_RandVarFunctionalFactory.hpp -sol/function/randvarfunctional/ROL_RandVarFunctionalInfo.hpp -sol/function/randvarfunctional/ROL_StochasticObjective.hpp -sol/function/randvarfunctional/deviation/ROL_DeviationMeasureFactory.hpp -sol/function/randvarfunctional/deviation/ROL_DeviationMeasureInfo.hpp -sol/function/randvarfunctional/error/ROL_ErrorMeasureFactory.hpp -sol/function/randvarfunctional/error/ROL_ErrorMeasureInfo.hpp -sol/function/randvarfunctional/probability/ROL_BPOE.hpp -sol/function/randvarfunctional/probability/ROL_ProbabilityFactory.hpp -sol/function/randvarfunctional/probability/ROL_ProbabilityInfo.hpp -sol/function/randvarfunctional/probability/ROL_SmoothedPOE.hpp 
-sol/function/randvarfunctional/regret/ROL_RegretMeasureFactory.hpp -sol/function/randvarfunctional/regret/ROL_RegretMeasureInfo.hpp -sol/function/randvarfunctional/risk/ROL_CVaR.hpp -sol/function/randvarfunctional/risk/ROL_ConvexCombinationRiskMeasure.hpp -sol/function/randvarfunctional/risk/ROL_EntropicRisk.hpp -sol/function/randvarfunctional/risk/ROL_HMCR.hpp -sol/function/randvarfunctional/risk/ROL_KLDivergence.hpp -sol/function/randvarfunctional/risk/ROL_MeanDeviation.hpp -sol/function/randvarfunctional/risk/ROL_MeanDeviationFromTarget.hpp -sol/function/randvarfunctional/risk/ROL_MeanSemiDeviation.hpp -sol/function/randvarfunctional/risk/ROL_MeanSemiDeviationFromTarget.hpp -sol/function/randvarfunctional/risk/ROL_MeanVariance.hpp -sol/function/randvarfunctional/risk/ROL_MeanVarianceFromTarget.hpp -sol/function/randvarfunctional/risk/ROL_MixedCVaR.hpp -sol/function/randvarfunctional/risk/ROL_QuantileRadius.hpp -sol/function/randvarfunctional/risk/ROL_RiskMeasureFactory.hpp -sol/function/randvarfunctional/risk/ROL_RiskMeasureInfo.hpp -sol/function/randvarfunctional/risk/fdivergence/ROL_FDivergence.hpp -sol/function/randvarfunctional/risk/spectral/ROL_ChebyshevSpectral.hpp -sol/function/randvarfunctional/risk/spectral/ROL_SecondOrderCVaR.hpp -sol/function/randvarfunctional/risk/spectral/ROL_SpectralRisk.hpp -sol/sampler/ROL_SROMGenerator.hpp -sol/sampler/ROL_UserInputGenerator.hpp -sol/status/ROL_PH_StatusTest.hpp -status/ROL_BundleStatusTest.hpp -status/ROL_ConstraintStatusTest.hpp -status/ROL_FletcherStatusTest.hpp -status/ROL_StatusTest.hpp -utils/ROL_BisectionScalarMinimization.hpp -utils/ROL_BrentsScalarMinimization.hpp -utils/ROL_GoldenSectionScalarMinimization.hpp -utils/ROL_ScalarMinimizationTest.hpp \ No newline at end of file diff --git a/packages/rol/rol_parameters/read_cpp_source.py b/packages/rol/rol_parameters/read_cpp_source.py deleted file mode 100644 index 7d8a77f976e5..000000000000 --- a/packages/rol/rol_parameters/read_cpp_source.py +++ 
/dev/null @@ -1,120 +0,0 @@ - - -import pathlib - - -def contains_escaped_quote_advanced( s : str ) -> bool: - """ - Determines if a string contains an escaped double quote character. - - This function checks for occurrences of double quotes (") that are - preceded by an odd number of backslashes (\), indicating that the - quote is escaped. - - Parameters: - s (str): The input string to check. - - Returns: - bool: True if an escaped double quote is found, False otherwise. - """ - i = 0 - while i < len(s): - if s[i] == '\\': - backslash_count = 1 - i += 1 - - # Count consecutive backslashes - while i < len(s) and s[i] == '\\': - backslash_count += 1 - i += 1 - - # If there's an odd number of backslashes followed by a quote, then it is escaped - if i < len(s) and s[i] == '"' and backslash_count % 2 == 1: - return True - else: - i += 1 - return False - - - -def strip_cpp_comments( cpp_source : str ) -> str: - """ - Removes C++ style comments (both single-line and multi-line) from a string of C++ source code. - - This function strips out both single-line (//) and multi-line (/* ... */) comments - from the provided C++ source code, while preserving the content within string literals. - - Parameters: - cpp_source (str): The input C++ source code as a string. - - Returns: - str: The source code with comments removed. 
- """ - in_string = False - in_single_line_comment = False - in_multi_line_comment = False - result = [] - i = 0 - while i < len(cpp_source): - # Check for string start/end - if cpp_source[i] == '"' and not (in_single_line_comment or in_multi_line_comment): - # Extract substring from the current position backwards to the last non-escaped quote or start - substring = cpp_source[:i+1][::-1] - # Check if the quote is escaped - if not contains_escaped_quote_advanced(substring): - in_string = not in_string - result.append(cpp_source[i]) - # Check for single-line comment start - elif i+1 < len(cpp_source) and cpp_source[i:i+2] == "//" and not (in_string or in_multi_line_comment): - in_single_line_comment = True - i += 1 # Skip next character to avoid parsing '/' twice - # Check for multi-line comment start - elif i + 1 < len(cpp_source) and cpp_source[i:i+2] == "/*" and not (in_string or in_single_line_comment): - in_multi_line_comment = True - i += 1 # Skip next character to avoid parsing '*' twice - # Check for single-line comment end - elif in_single_line_comment and cpp_source[i] == "\n": - in_single_line_comment = False - result.append(cpp_source[i]) # Include newline in result - # Check for multi-line comment end - elif i + 1 < len(cpp_source) and in_multi_line_comment and cpp_source[i:i+2] == "*/": - in_multi_line_comment = False - i += 1 # Skip next character to avoid parsing '/' twice - # Append character if not in a comment - elif not (in_single_line_comment or in_multi_line_comment): - result.append(cpp_source[i]) - i += 1 - - return ''.join(result) - - - -def read_cpp_source( cpp_file : pathlib.Path ) -> str: - """ - Reads a C++ source file, removes comments, and returns the cleaned source code. - - This function reads the content of a given C++ source file, strips out all comments, - and returns the resulting cleaned source code as a string. - - Parameters: - cpp_file (pathlib.Path): The path to the C++ source file to read. 
- - Returns: - str: The C++ source code with comments removed. - - Raises: - AssertionError: If the provided path does not exist or is not a file. - """ - # Ensure the argument is a file - assert( cpp_file.exists() ) - assert( cpp_file.is_file() ) - - # Read C++ source file to string - with open(cpp_file,"r") as f: - content = f.read() - - cpp_source = strip_cpp_comments(content) - - return cpp_source - - diff --git a/packages/rol/rol_parameters/rol_parameters.py b/packages/rol/rol_parameters/rol_parameters.py index ea6538a427c1..4945d9351e75 100644 --- a/packages/rol/rol_parameters/rol_parameters.py +++ b/packages/rol/rol_parameters/rol_parameters.py @@ -1,51 +1,146 @@ import re -import sys +import subprocess import pathlib -from find_files import find_files -from compile_json import compile_json +from typing import Dict, List, Tuple +import xml.etree.ElementTree as ET +from xml.dom import minidom -if __name__ == '__main__': +def create_xml_from_dict(dict_data: Dict, root_name: str = "Inputs") -> ET.Element: + def create_element(name: str, content: Dict) -> ET.Element: + element = ET.Element(name) + if isinstance(content, dict): + for key, value in content.items(): + if key == "Parameters" and value: + param_element = ET.SubElement(element, "Parameter") + param_element.set("name", "Valid Keys") + param_element.set("type", "Array(string)") + param_element.set("value", "{" + ",".join(value) + "}") + elif key == "Sublists": + for sublist_name, sublist_content in value.items(): + sublist_element = create_element("ParameterList", sublist_content) + sublist_element.set("name", sublist_name) + element.append(sublist_element) + return element + + root = create_element("ParameterList", dict_data) + root.set("name", root_name) + return root + +def prettify(elem: ET.Element) -> str: + rough_string = ET.tostring(elem, 'utf-8') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") + +def grep_source_files(src_directory: str) -> str: + 
grep_command = [ + 'grep', + '-rE', + '-e', r'(\.|\->)\s*(((s|g)et\s*\(\s*"([a-zA-Z0-9]|\s)+"\s*,\s*\S+\s*\))|sublist)', + '-e', r'(\.|\->)\s*sublist\s*\(\s*\"', + src_directory + ] - assert( len(sys.argv)>2 ) + try: + result = subprocess.run(grep_command, capture_output=True, text=True, check=True) + return result.stdout + except subprocess.CalledProcessError as e: + print(f"Error occurred: {e}") + return e.stderr - rol_root = pathlib.Path(sys.argv[1]) -# rol_root = pathlib.Path('/Users/gvonwin/Projects/github/ROL-Trilinos/packages/rol') - binary_dir = pathlib.Path(sys.argv[2])#rol_root/'rol_parameters' - rol_src = rol_root/'src' +def split_cpp_code(code_string: str) -> List[str]: + tokens = re.split(r'->|\.', code_string) + return [token.strip() for token in tokens if token.strip()] - # Create list of all (relative path) header files containing the token `ParameterList` in the C++ source - relative_pathfiles = find_files(rol_src,'ParameterList','*.hpp') +def extract_quoted_strings(string_list: List[str]) -> Tuple[str, ...]: + return tuple(s.strip('"') for s in string_list if s.startswith('"') and s.endswith('"')) - # Breakdown of the `sublist` search pattern: - # \b : Asserts a word boundary, ensuring that "sublist" is matched as a whole word. - # sublist : Matches the literal string "sublist". - # \s* : Matches zero or more whitespace characters. - # \( : Matches a literal opening parenthesis ((). - # "([^"]+)" : Capturing group that matches one or more characters that are not double quotes ("), - # capturing the content between double quotes. - # \) : Matches a literal closing parenthesis ()). 
- sublist_pattern = re.compile(r'\bsublist\s*\(\s*"([^"]+)"\s*\)', re.MULTILINE) - sublist_json = compile_json(sublist_pattern,rol_src,relative_pathfiles) +def parse_cpp_strings(input_list: List[str]) -> List[str]: + parsed_list = [] + for item in input_list: + match = re.search(r'(\w+)$|"([^"]*)"|\b(?:get|set)\s*\(\s*"([^"]*)"', item) + if match: + if match.group(1): + parsed_list.append(match.group(1)) + elif match.group(2): + parsed_list.append(f'"{match.group(2)}"') + elif match.group(3): + parsed_list.append(f'"{match.group(3)}"') + return parsed_list - with open(binary_dir / 'sublist.json', 'w') as f: - f.write(sublist_json) +def build_hierarchy(data: Dict[str, List[str]]) -> Dict[str, List[str]]: + def resolve_list(value_list: List[str]) -> List[str]: + if not value_list: + return value_list + first_item = value_list[0] + if first_item in data and not first_item.startswith('"'): + return resolve_list(data[first_item]) + value_list[1:] + else: + return [first_item] + resolve_list(value_list[1:]) - # Breakdown of the `getkey` search pattern: - # \b : Asserts a word boundary, ensuring that "get" is matched as a whole word. - # get : Matches the literal string "sublist". - # \s* : Matches zero or more whitespace characters. - # \( : Matches a literal opening parenthesis ((). - # "([^"]+)" : Capturing group that matches one or more characters that are not double quotes ("), - # capturing the content between double quotes. - # , : Matches a literal comma. - # \) : Matches a literal closing parenthesis ()). 
- # ; : Matches a literal semicolon - getkey_pattern = re.compile(rf'\bget\s*\(\s*"([^"]*)"\s*,.*\)\s*;', re.MULTILINE) - getkey_json = compile_json(getkey_pattern,rol_src,relative_pathfiles) + return {key: resolve_list(value) for key, value in data.items()} + +def create_hierarchical_dict(list_of_lists: List[List[str]]) -> Dict: + result = {} + for path in list_of_lists: + current = result + for key in path[:-1]: + current = current.setdefault(key, {}) + current[path[-1]] = {} + return result + +def parse(params: Dict) -> Dict: + result = {'Parameters': [], 'Sublists': {}} + for k, v in params.items(): + if isinstance(v, dict): + if v: + result['Sublists'][k] = parse(v) + else: + result['Parameters'].append(k) + else: + result['Parameters'].append(k) + return result + +if __name__ == '__main__': + rol_src = (pathlib.Path.cwd().parents[0]/'src').resolve() + make_relative = lambda path_str: pathlib.Path(path_str).relative_to(rol_src, walk_up=True) + strip_excess_whitespace = lambda text: re.sub(r'\s+', ' ', text).strip() - with open(binary_dir / 'getkey.json', 'w') as f: - f.write(getkey_json) + local_sublist_pattern = re.compile(r'[ParameterList|auto]\s*[&]\s*(\w+)\s*=\s*(\w+)[\.|\->](.*)') + exclusions = ['compatibility', 'step', 'zoo', 'sol', 'oed', 'dynamic'] + output = grep_source_files(str(rol_src)) + data = {} + for line in output.splitlines(): + file, *code_parts = line.split(':') + file = str(make_relative(file)) + code = strip_excess_whitespace(':'.join(code_parts)) + if not any(f'{e}/' in file for e in exclusions): + data.setdefault(file, []).append(code) + paramset = set() + for file, code in data.items(): + sublist = {} + parameters = [] + for line in code: + match = re.search(local_sublist_pattern, line) + if match: + sublist[match.group(1)] = [match.group(2)] + parse_cpp_strings(split_cpp_code(match.group(3))) + else: + if '=' in line: + line = line.split('=')[1].strip() + parameters.append(parse_cpp_strings(split_cpp_code(line))) + + sublist = 
build_hierarchy(sublist) + parameters = [build_hierarchy(sublist)[sublist_key] + param[1:] for param in parameters for sublist_key in sublist] + paramset.update(map(extract_quoted_strings, parameters)) + parameters = create_hierarchical_dict(sorted(filter(len, map(list, paramset)))) + parameters.pop('SOL', None) + xml_root = create_xml_from_dict(parse(parameters)) + pretty_xml = prettify(xml_root) + + with open('rol_parameters.xml', 'w') as f: + f.write(pretty_xml) + + print("XML file 'rol_parameters.xml' has been created.") diff --git a/packages/rol/rol_parameters/rol_parameters.xml b/packages/rol/rol_parameters/rol_parameters.xml new file mode 100644 index 000000000000..21354eb88fd0 --- /dev/null +++ b/packages/rol/rol_parameters/rol_parameters.xml @@ -0,0 +1,442 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 17586e58bf9629ba2192719269f94251880e1983 Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Mon, 16 Sep 2024 15:02:02 -0600 Subject: [PATCH 057/243] Updated Reduced_Constraint_SimOpt. Fixed bugs and updated to ROLv2.0 interface. Added test. 
Signed-off-by: Drew Kouri --- .../simopt/ROL_Reduced_Constraint_SimOpt.hpp | 254 +++------ .../ROL_Reduced_Constraint_SimOpt_Def.hpp | 329 ++++++++++++ packages/rol/test/function/CMakeLists.txt | 8 + packages/rol/test/function/test_19.cpp | 482 ++++++++++++++++++ 4 files changed, 880 insertions(+), 193 deletions(-) create mode 100644 packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt_Def.hpp create mode 100644 packages/rol/test/function/test_19.cpp diff --git a/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt.hpp b/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt.hpp index 3747670b09f8..580eeb68c21d 100644 --- a/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt.hpp +++ b/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt.hpp @@ -47,94 +47,49 @@ #include "ROL_Constraint_SimOpt.hpp" #include "ROL_VectorController.hpp" +#include "ROL_BatchManager.hpp" namespace ROL { template class Reduced_Constraint_SimOpt : public Constraint { private: - const ROL::Ptr> conVal_; - const ROL::Ptr> conRed_; - const ROL::Ptr> stateStore_; - ROL::Ptr> adjointStore_; + const ROL::Ptr> conVal_, conRed_; + const ROL::Ptr> stateStore_, adjointStore_; // Primal vectors - ROL::Ptr> state_; - ROL::Ptr> adjoint_; - ROL::Ptr> residual_; - ROL::Ptr> state_sens_; - ROL::Ptr> adjoint_sens_; + const ROL::Ptr> state_, adjoint_, residual_; + const ROL::Ptr> state_sens_, adjoint_sens_; // Dual vectors - ROL::Ptr> dualstate_; - ROL::Ptr> dualstate1_; - ROL::Ptr> dualadjoint_; - ROL::Ptr> dualcontrol_; - ROL::Ptr> dualresidual_; + const ROL::Ptr> dualstate_, dualstate1_, dualadjoint_; + const ROL::Ptr> dualcontrol_, dualresidual_; const bool storage_; const bool useFDhessVec_; + unsigned nupda_, nvalu_, njaco_, najac_, nhess_; + unsigned nstat_, nadjo_, nssen_, nasen_; + bool updateFlag_; int updateIter_; + UpdateType updateType_; + bool newUpdate_; + bool isUpdated_; - void solve_state_equation(const Vector &z, Real &tol) { - // Check if 
state has been computed. - bool isComputed = false; - if (storage_) { - isComputed = stateStore_->get(*state_,Constraint::getParameter()); - } - // Solve state equation if not done already. - if (!isComputed || !storage_) { - // Update equality constraint with new Opt variable. - conRed_->update_2(z,updateFlag_,updateIter_); - // Solve state equation. - conRed_->solve(*dualadjoint_,*state_,z,tol); - // Update equality constraint with new Sim variable. - conRed_->update_1(*state_,updateFlag_,updateIter_); - // Update full objective function. - conVal_->update(*state_,z,updateFlag_,updateIter_); - // Store state. - if (storage_) { - stateStore_->set(*state_,Constraint::getParameter()); - } - } - } + void solve_state_equation(const Vector &z, Real &tol); /** \brief Given \f$(u,z)\in\mathcal{U}\times\mathcal{Z}\f$ which solves the state equation, solve the adjoint equation \f$c_u(u,z)^*\lambda + c_u(u,z)^*w = 0\f$ for \f$\lambda=\lambda(u,z)\in\mathcal{C}^*\f$. */ - void solve_adjoint_equation(const Vector &w, const Vector &z, Real &tol) { - // Check if adjoint has been computed. - bool isComputed = false; - if (storage_) { - isComputed = adjointStore_->get(*adjoint_,Constraint::getParameter()); - } - // Solve adjoint equation if not done already. - if (!isComputed || !storage_) { - // Evaluate the full gradient wrt u - conVal_->applyAdjointJacobian_1(*dualstate_,w,*state_,z,tol); - // Solve adjoint equation - conRed_->applyInverseAdjointJacobian_1(*adjoint_,*dualstate_,*state_,z,tol); - adjoint_->scale(static_cast(-1)); - // Store adjoint - if (storage_) { - adjointStore_->set(*adjoint_,Constraint::getParameter()); - } - } - } + void solve_adjoint_equation(const Vector &w, const Vector &z, Real &tol); /** \brief Given \f$(u,z)\in\mathcal{U}\times\mathcal{Z}\f$ which solves the state equation and a direction \f$v\in\mathcal{Z}\f$, solve the state senstivity equation \f$c_u(u,z)s + c_z(u,z)v = 0\f$ for \f$s=u_z(z)v\in\mathcal{U}\f$. 
*/ - void solve_state_sensitivity(const Vector &v, const Vector &z, Real &tol) { - // Solve state sensitivity equation - conRed_->applyJacobian_2(*dualadjoint_,v,*state_,z,tol); - dualadjoint_->scale(static_cast(-1)); - conRed_->applyInverseJacobian_1(*state_sens_,*dualadjoint_,*state_,z,tol); - } + void solve_state_sensitivity(const Vector &v, const Vector &z, Real &tol); /** \brief Given \f$(u,z)\in\mathcal{U}\times\mathcal{Z}\f$, the adjoint variable \f$\lambda\in\mathcal{C}^*\f$, and a direction \f$v\in\mathcal{Z}\f$, solve the @@ -143,122 +98,77 @@ class Reduced_Constraint_SimOpt : public Constraint { + c_{zu}(u,z)(\cdot,v)^*\lambda = 0\f$ for \f$p = \lambda_z(u(z),z)v\in\mathcal{C}^*\f$. */ - void solve_adjoint_sensitivity(const Vector &w, const Vector &v, const Vector &z, Real &tol) { - // Evaluate full hessVec in the direction (s,v) - conVal_->applyAdjointHessian_11(*dualstate_,w,*state_sens_,*state_,z,tol); - conVal_->applyAdjointHessian_12(*dualstate1_,w,v,*state_,z,tol); - dualstate_->plus(*dualstate1_); - // Apply adjoint Hessian of constraint - conRed_->applyAdjointHessian_11(*dualstate1_,*adjoint_,*state_sens_,*state_,z,tol); - dualstate_->plus(*dualstate1_); - conRed_->applyAdjointHessian_21(*dualstate1_,*adjoint_,v,*state_,z,tol); - dualstate_->plus(*dualstate1_); - // Solve adjoint sensitivity equation - dualstate_->scale(static_cast(-1)); - conRed_->applyInverseAdjointJacobian_1(*adjoint_sens_,*dualstate_,*state_,z,tol); - } + void solve_adjoint_sensitivity(const Vector &w, const Vector &v, const Vector &z, Real &tol); public: /** \brief Constructor. - @param[in] obj is a pointer to a SimOpt objective function. - @param[in] con is a pointer to a SimOpt equality constraint. + @param[in] conVal is a pointer to a SimOpt constraint, to be evaluated. + @param[in] conRed is a pointer to a SimOpt constraint, to be reduced. @param[in] stateStore is a pointer to a VectorController object. 
@param[in] state is a pointer to a state space vector, \f$\mathcal{U}\f$. @param[in] control is a pointer to a optimization space vector, \f$\mathcal{Z}\f$. - @param[in] adjoint is a pointer to a dual constraint space vector, \f$\mathcal{C}^*\f$. + @param[in] adjoint is a pointer to a dual constraint space vector, \f$\mathcal{C}_{\text{red}}^*\f$. + @param[in] residual is a pointer to a primal constraint space vector, \f$\mathcal{C}_{\text{val}}\f$. @param[in] storage is a flag whether or not to store computed states and adjoints. @param[in] useFDhessVec is a flag whether or not to use a finite-difference Hessian approximation. */ Reduced_Constraint_SimOpt( - const ROL::Ptr > &conVal, - const ROL::Ptr > &conRed, - const ROL::Ptr > &stateStore, - const ROL::Ptr > &state, - const ROL::Ptr > &control, - const ROL::Ptr > &adjoint, - const ROL::Ptr > &residual, - const bool storage = true, - const bool useFDhessVec = false) - : conVal_(conVal), conRed_(conRed), stateStore_(stateStore), - storage_(storage), useFDhessVec_(useFDhessVec), - updateFlag_(true), updateIter_(0) { - adjointStore_ = ROL::makePtr>(); - state_ = state->clone(); - adjoint_ = adjoint->clone(); - residual_ = residual->clone(); - state_sens_ = state->clone(); - adjoint_sens_ = adjoint->clone(); - dualstate_ = state->dual().clone(); - dualstate1_ = state->dual().clone(); - dualadjoint_ = adjoint->dual().clone(); - dualcontrol_ = control->dual().clone(); - dualresidual_ = residual->dual().clone(); - } + const ROL::Ptr> &conVal, + const ROL::Ptr> &conRed, + const ROL::Ptr> &stateStore, + const ROL::Ptr> &state, + const ROL::Ptr> &control, + const ROL::Ptr> &adjoint, + const ROL::Ptr> &residual, + bool storage = true, + bool useFDhessVec = false); /** \brief Secondary, general constructor for use with dual optimization vector spaces where the user does not define the dual() method. - @param[in] obj is a pointer to a SimOpt objective function. - @param[in] con is a pointer to a SimOpt equality constraint. 
+ @param[in] conVal is a pointer to a SimOpt constraint, to be evaluated. + @param[in] conRed is a pointer to a SimOpt constraint, to be reduced. @param[in] stateStore is a pointer to a VectorController object. @param[in] state is a pointer to a state space vector, \f$\mathcal{U}\f$. @param[in] control is a pointer to a optimization space vector, \f$\mathcal{Z}\f$. - @param[in] adjoint is a pointer to a dual constraint space vector, \f$\mathcal{C}^*\f$. + @param[in] adjoint is a pointer to a dual constraint space vector, \f$\mathcal{C}_{\text{red}}^*\f$. + @param[in] residual is a pointer to a primal constraint space vector, \f$\mathcal{C}_{\text{val}}\f$. @param[in] dualstate is a pointer to a dual state space vector, \f$\mathcal{U}^*\f$. - @param[in] dualadjoint is a pointer to a constraint space vector, \f$\mathcal{C}\f$. + @param[in] dualadjoint is a pointer to a constraint space vector, \f$\mathcal{C}_{\text{red}}\f$. + @param[in] dualresidual is a pointer to a dual constraint space vector, \f$\mathcal{C}_{\text{val}}^*\f$. @param[in] storage is a flag whether or not to store computed states and adjoints. @param[in] useFDhessVec is a flag whether or not to use a finite-difference Hessian approximation. 
*/ Reduced_Constraint_SimOpt( - const ROL::Ptr > &conVal, - const ROL::Ptr > &conRed, - const ROL::Ptr > &stateStore, - const ROL::Ptr > &state, - const ROL::Ptr > &control, - const ROL::Ptr > &adjoint, - const ROL::Ptr > &residual, - const ROL::Ptr > &dualstate, - const ROL::Ptr > &dualcontrol, - const ROL::Ptr > &dualadjoint, - const ROL::Ptr > &dualresidual, - const bool storage = true, - const bool useFDhessVec = false) - : conVal_(conVal), conRed_(conRed), stateStore_(stateStore), - storage_(storage), useFDhessVec_(useFDhessVec), - updateFlag_(true), updateIter_(0) { - adjointStore_ = ROL::makePtr>(); - state_ = state->clone(); - adjoint_ = adjoint->clone(); - residual_ = residual->clone(); - state_sens_ = state->clone(); - adjoint_sens_ = adjoint->clone(); - dualstate_ = dualstate->clone(); - dualstate1_ = dualstate->clone(); - dualadjoint_ = dualadjoint->clone(); - dualcontrol_ = dualcontrol->clone(); - dualresidual_ = dualresidual->clone(); - } + const ROL::Ptr> &conVal, + const ROL::Ptr> &conRed, + const ROL::Ptr> &stateStore, + const ROL::Ptr> &state, + const ROL::Ptr> &control, + const ROL::Ptr> &adjoint, + const ROL::Ptr> &residual, + const ROL::Ptr> &dualstate, + const ROL::Ptr> &dualcontrol, + const ROL::Ptr> &dualadjoint, + const ROL::Ptr> &dualresidual, + bool storage = true, + bool useFDhessVec = false); + + void summarize(std::ostream &stream, const Ptr> &bman = nullPtr) const; + + void reset(); /** \brief Update the SimOpt objective function and equality constraint. 
*/ - void update( const Vector &z, bool flag = true, int iter = -1 ) { - updateFlag_ = flag; - updateIter_ = iter; - stateStore_->constraintUpdate(true); - adjointStore_->constraintUpdate(flag); - } + void update( const Vector &z, bool flag = true, int iter = -1 ); + void update( const Vector &z, UpdateType type, int iter = -1 ); /** \brief Given \f$z\in\mathcal{Z}\f$, evaluate the equality constraint \f$\widehat{c}(z) = c(u(z),z)\f$ where \f$u=u(z)\in\mathcal{U}\f$ solves \f$e(u,z) = 0\f$. */ - void value( Vector &c, const Vector &z, Real &tol ) { - // Solve state equation - solve_state_equation(z,tol); - // Get constraint value - conVal_->value(c,*state_,z,tol); - } + void value( Vector &c, const Vector &z, Real &tol ); /** \brief Given \f$z\in\mathcal{Z}\f$, apply the Jacobian to a vector \f$\widehat{c}'(z)v = c_u(u,z)s + c_z(u,z)v\f$ where @@ -266,61 +176,17 @@ class Reduced_Constraint_SimOpt : public Constraint { \f$e_u(u,z)s+e_z(u,z)v = 0\f$. */ void applyJacobian( Vector &jv, const Vector &v, - const Vector &z, Real &tol ) { - // Solve state equation. - solve_state_equation(z,tol); - // Solve state sensitivity equation. - solve_state_sensitivity(v,z,tol); - // Apply Sim Jacobian to state sensitivity. - conVal_->applyJacobian_1(*residual_,*state_sens_,*state_,z,tol); - // Apply Opt Jacobian to vector. 
- conVal_->applyJacobian_2(jv,v,*state_,z,tol); - jv.plus(*residual_); - } + const Vector &z, Real &tol ); void applyAdjointJacobian( Vector &ajw, const Vector &w, - const Vector &z, Real &tol ) { - // Solve state equation - solve_state_equation(z,tol); - // Solve adjoint equation - solve_adjoint_equation(w,z,tol); - // Evaluate the full gradient wrt z - conVal_->applyAdjointJacobian_2(*dualcontrol_,w,*state_,z,tol); - // Build gradient - conRed_->applyAdjointJacobian_2(ajw,*adjoint_,*state_,z,tol); - ajw.plus(*dualcontrol_); - } + const Vector &z, Real &tol ); /** \brief Given \f$z\in\mathcal{Z}\f$, evaluate the Hessian of the objective function \f$\nabla^2\widehat{J}(z)\f$ in the direction \f$v\in\mathcal{Z}\f$. */ void applyAdjointHessian( Vector &ahwv, const Vector &w, const Vector &v, const Vector &z, - Real &tol ) { - if ( useFDhessVec_ ) { - Constraint::applyAdjointHessian(ahwv,w,v,z,tol); - } - else { - // Solve state equation - solve_state_equation(z,tol); - // Solve adjoint equation - solve_adjoint_equation(w,z,tol); - // Solve state sensitivity equation - solve_state_sensitivity(v,z,tol); - // Solve adjoint sensitivity equation - solve_adjoint_sensitivity(w,v,z,tol); - // Build hessVec - conRed_->applyAdjointJacobian_2(ahwv,*adjoint_sens_,*state_,z,tol); - conVal_->applyAdjointHessian_21(*dualcontrol_,w,*state_sens_,*state_,z,tol); - ahwv.plus(*dualcontrol_); - conVal_->applyAdjointHessian_22(*dualcontrol_,w,v,*state_,z,tol); - ahwv.plus(*dualcontrol_); - conRed_->applyAdjointHessian_12(*dualcontrol_,*adjoint_,*state_sens_,*state_,z,tol); - ahwv.plus(*dualcontrol_); - conRed_->applyAdjointHessian_22(*dualcontrol_,*adjoint_,v,*state_,z,tol); - ahwv.plus(*dualcontrol_); - } - } + Real &tol ); // For parametrized (stochastic) objective functions and constraints public: @@ -333,4 +199,6 @@ class Reduced_Constraint_SimOpt : public Constraint { } // namespace ROL +#include "ROL_Reduced_Constraint_SimOpt_Def.hpp" + #endif diff --git 
a/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt_Def.hpp b/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt_Def.hpp new file mode 100644 index 000000000000..aec6fef556ab --- /dev/null +++ b/packages/rol/src/function/simopt/ROL_Reduced_Constraint_SimOpt_Def.hpp @@ -0,0 +1,329 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + + +#ifndef ROL_REDUCED_CONSTRAINT_SIMOPT_DEF_H +#define ROL_REDUCED_CONSTRAINT_SIMOPT_DEF_H + +namespace ROL { + +template +void Reduced_Constraint_SimOpt::solve_state_equation(const Vector &z, Real &tol) { + if (!isUpdated_) { + // Update equality constraint with new Opt variable. + if (newUpdate_) conRed_->update_2(z,updateType_,updateIter_); + else conRed_->update_2(z,updateFlag_,updateIter_); + } + // Check if state has been computed. + bool isComputed = storage_ ? stateStore_->get(*state_,Constraint::getParameter()) : false; + // Solve state equation if not done already. + if (!isComputed || !storage_) { + // Solve state equation. + conRed_->solve(*dualadjoint_,*state_,z,tol); + nstat_++; + // Store state. + if (storage_) stateStore_->set(*state_,Constraint::getParameter()); + } + if (!isUpdated_) { + // Update equality constraint with new Sim variable. + if (newUpdate_) conRed_->update_1(*state_,updateType_,updateIter_); + else conRed_->update_1(*state_,updateFlag_,updateIter_); + // Update full objective function. 
+ if (newUpdate_) conVal_->update(*state_,z,updateType_,updateIter_); + else conVal_->update(*state_,z,updateFlag_,updateIter_); + isUpdated_ = true; + } +} + +template +void Reduced_Constraint_SimOpt::solve_adjoint_equation(const Vector &w, const Vector &z, Real &tol) { + // Check if adjoint has been computed. + bool isComputed = storage_ ? adjointStore_->get(*adjoint_,Constraint::getParameter()) : false; + // Solve adjoint equation if not done already. + if (!isComputed || !storage_) { + // Evaluate the full gradient wrt u + conVal_->applyAdjointJacobian_1(*dualstate_,w,*state_,z,tol); + // Solve adjoint equation + conRed_->applyInverseAdjointJacobian_1(*adjoint_,*dualstate_,*state_,z,tol); + adjoint_->scale(static_cast(-1)); + nadjo_++; + // Store adjoint + if (storage_) adjointStore_->set(*adjoint_,Constraint::getParameter()); + } +} + +template +void Reduced_Constraint_SimOpt::solve_state_sensitivity(const Vector &v, const Vector &z, Real &tol) { + // Solve state sensitivity equation + conRed_->applyJacobian_2(*dualadjoint_,v,*state_,z,tol); + dualadjoint_->scale(static_cast(-1)); + conRed_->applyInverseJacobian_1(*state_sens_,*dualadjoint_,*state_,z,tol); + nssen_++; +} + +template +void Reduced_Constraint_SimOpt::solve_adjoint_sensitivity(const Vector &w, const Vector &v, const Vector &z, Real &tol) { + // Evaluate full hessVec in the direction (s,v) + conVal_->applyAdjointHessian_11(*dualstate_,w,*state_sens_,*state_,z,tol); + conVal_->applyAdjointHessian_21(*dualstate1_,w,v,*state_,z,tol); + dualstate_->plus(*dualstate1_); + // Apply adjoint Hessian of constraint + conRed_->applyAdjointHessian_11(*dualstate1_,*adjoint_,*state_sens_,*state_,z,tol); + dualstate_->plus(*dualstate1_); + conRed_->applyAdjointHessian_21(*dualstate1_,*adjoint_,v,*state_,z,tol); + dualstate_->plus(*dualstate1_); + // Solve adjoint sensitivity equation + dualstate_->scale(static_cast(-1)); + conRed_->applyInverseAdjointJacobian_1(*adjoint_sens_,*dualstate_,*state_,z,tol); + 
nasen_++; +} + +template +Reduced_Constraint_SimOpt::Reduced_Constraint_SimOpt( + const ROL::Ptr> &conVal, + const ROL::Ptr> &conRed, + const ROL::Ptr> &stateStore, + const ROL::Ptr> &state, + const ROL::Ptr> &control, + const ROL::Ptr> &adjoint, + const ROL::Ptr> &residual, + bool storage, + bool useFDhessVec) + : conVal_( conVal ), + conRed_( conRed ), + stateStore_( stateStore ), + adjointStore_( ROL::makePtr>() ), + state_( state->clone() ), + adjoint_( adjoint->clone() ), + residual_( residual->clone() ), + state_sens_( state->clone() ), + adjoint_sens_( adjoint->clone() ), + dualstate_( state->dual().clone() ), + dualstate1_( state->dual().clone() ), + dualadjoint_( adjoint->dual().clone() ), + dualcontrol_( control->dual().clone() ), + dualresidual_( residual->dual().clone() ), + storage_(storage), useFDhessVec_(useFDhessVec), + nupda_(0), nvalu_(0), njaco_(0), najac_(0), nhess_(0), + nstat_(0), nadjo_(0), nssen_(0), nasen_(0), + updateFlag_(true), updateIter_(0), updateType_(UpdateType::Initial), + newUpdate_(false), isUpdated_(true) {} + +template +Reduced_Constraint_SimOpt::Reduced_Constraint_SimOpt( + const ROL::Ptr> &conVal, + const ROL::Ptr> &conRed, + const ROL::Ptr> &stateStore, + const ROL::Ptr> &state, + const ROL::Ptr> &control, + const ROL::Ptr> &adjoint, + const ROL::Ptr> &residual, + const ROL::Ptr> &dualstate, + const ROL::Ptr> &dualcontrol, + const ROL::Ptr> &dualadjoint, + const ROL::Ptr> &dualresidual, + bool storage, + bool useFDhessVec) + : conVal_( conVal ), + conRed_( conRed ), + stateStore_( stateStore ), + adjointStore_( ROL::makePtr>() ), + state_( state->clone() ), + adjoint_( adjoint->clone() ), + residual_( residual->clone() ), + state_sens_( state->clone() ), + adjoint_sens_( adjoint->clone() ), + dualstate_( dualstate->clone() ), + dualstate1_( dualstate->clone() ), + dualadjoint_( dualadjoint->clone() ), + dualcontrol_( dualcontrol->clone() ), + dualresidual_( dualresidual->clone() ), + storage_(storage), 
useFDhessVec_(useFDhessVec), + nupda_(0), nvalu_(0), njaco_(0), najac_(0), nhess_(0), + nstat_(0), nadjo_(0), nssen_(0), nasen_(0), + updateFlag_(true), updateIter_(0), updateType_(UpdateType::Initial), + newUpdate_(false), isUpdated_(true) {} + +template +void Reduced_Constraint_SimOpt::summarize(std::ostream &stream, const Ptr> &bman) const { + int nupda(0), nvalu(0), njaco(0), najac(0), nhess(0), nstat(0), nadjo(0), nssen(0), nasen(0); + if (bman == nullPtr) { + nupda = nupda_; + nvalu = nvalu_; + njaco = njaco_; + najac = najac_; + nhess = nhess_; + nstat = nstat_; + nadjo = nadjo_; + nssen = nssen_; + nasen = nasen_; + } + else { + auto sumAll = [bman](int val) { + Real global(0), local(val); + bman->sumAll(&local,&global,1); + return static_cast(global); + }; + nupda = sumAll(nupda_); + nvalu = sumAll(nvalu_); + njaco = sumAll(njaco_); + najac = sumAll(najac_); + nhess = sumAll(nhess_); + nstat = sumAll(nstat_); + nadjo = sumAll(nadjo_); + nssen = sumAll(nssen_); + nasen = sumAll(nasen_); + } + stream << std::endl; + stream << std::string(80,'=') << std::endl; + stream << " ROL::Reduced_Objective_SimOpt::summarize" << std::endl; + stream << " Number of calls to update: " << nupda << std::endl; + stream << " Number of calls to value: " << nvalu << std::endl; + stream << " Number of calls to applyJacobian: " << njaco << std::endl; + stream << " Number of calls to applyAdjointJacobian: " << najac << std::endl; + stream << " Number of calls to hessvec: " << nhess << std::endl; + stream << " Number of state solves: " << nstat << std::endl; + stream << " Number of adjoint solves: " << nadjo << std::endl; + stream << " Number of state sensitivity solves: " << nssen << std::endl; + stream << " Number of adjoint sensitivity solves: " << nasen << std::endl; + stream << std::string(80,'=') << std::endl; + stream << std::endl; +} + +template +void Reduced_Constraint_SimOpt::reset() { + nupda_ = 0; nvalu_ = 0; njaco_ = 0; najac_ = 0; nhess_ = 0; + nstat_ = 0; nadjo_ = 0; 
nssen_ = 0; nasen_ = 0; +} + +template +void Reduced_Constraint_SimOpt::update( const Vector &z, bool flag, int iter ) { + nupda_++; + updateFlag_ = flag; + updateIter_ = iter; + stateStore_->constraintUpdate(true); + adjointStore_->constraintUpdate(flag); +} +template +void Reduced_Constraint_SimOpt::update( const Vector &z, UpdateType type, int iter ) { + nupda_++; + isUpdated_ = false; + newUpdate_ = true; + updateType_ = type; + updateIter_ = iter; + stateStore_->objectiveUpdate(type); + adjointStore_->objectiveUpdate(type); +} + +template +void Reduced_Constraint_SimOpt::value( Vector &c, const Vector &z, Real &tol ) { + nvalu_++; + // Solve state equation + solve_state_equation(z,tol); + // Get constraint value + conVal_->value(c,*state_,z,tol); +} + +template +void Reduced_Constraint_SimOpt::applyJacobian( Vector &jv, const Vector &v, + const Vector &z, Real &tol ) { + njaco_++; + // Solve state equation. + solve_state_equation(z,tol); + // Solve state sensitivity equation. + solve_state_sensitivity(v,z,tol); + // Apply Sim Jacobian to state sensitivity. + conVal_->applyJacobian_1(*residual_,*state_sens_,*state_,z,tol); + // Apply Opt Jacobian to vector. 
+ conVal_->applyJacobian_2(jv,v,*state_,z,tol); + jv.plus(*residual_); +} + +template +void Reduced_Constraint_SimOpt::applyAdjointJacobian( Vector &ajw, const Vector &w, + const Vector &z, Real &tol ) { + najac_++; + // Solve state equation + solve_state_equation(z,tol); + // Solve adjoint equation + solve_adjoint_equation(w,z,tol); + // Evaluate the full gradient wrt z + conVal_->applyAdjointJacobian_2(*dualcontrol_,w,*state_,z,tol); + // Build gradient + conRed_->applyAdjointJacobian_2(ajw,*adjoint_,*state_,z,tol); + ajw.plus(*dualcontrol_); +} + +template +void Reduced_Constraint_SimOpt::applyAdjointHessian( Vector &ahwv, const Vector &w, + const Vector &v, const Vector &z, + Real &tol ) { + nhess_++; + if ( useFDhessVec_ ) { + Constraint::applyAdjointHessian(ahwv,w,v,z,tol); + } + else { + // Solve state equation + solve_state_equation(z,tol); + // Solve adjoint equation + solve_adjoint_equation(w,z,tol); + // Solve state sensitivity equation + solve_state_sensitivity(v,z,tol); + // Solve adjoint sensitivity equation + solve_adjoint_sensitivity(w,v,z,tol); + // Build hessVec + conRed_->applyAdjointJacobian_2(ahwv,*adjoint_sens_,*state_,z,tol); + conVal_->applyAdjointHessian_12(*dualcontrol_,w,*state_sens_,*state_,z,tol); + ahwv.plus(*dualcontrol_); + conVal_->applyAdjointHessian_22(*dualcontrol_,w,v,*state_,z,tol); + ahwv.plus(*dualcontrol_); + conRed_->applyAdjointHessian_12(*dualcontrol_,*adjoint_,*state_sens_,*state_,z,tol); + ahwv.plus(*dualcontrol_); + conRed_->applyAdjointHessian_22(*dualcontrol_,*adjoint_,v,*state_,z,tol); + ahwv.plus(*dualcontrol_); + } +} + +} + +#endif diff --git a/packages/rol/test/function/CMakeLists.txt b/packages/rol/test/function/CMakeLists.txt index 22aa23c105df..c1a7ad12c762 100644 --- a/packages/rol/test/function/CMakeLists.txt +++ b/packages/rol/test/function/CMakeLists.txt @@ -155,6 +155,14 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( ADD_DIR_TO_NAME ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + ReducedConstraintSimOptCheck + SOURCES 
test_19.cpp + ARGS PrintItAll + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + TRIBITS_COPY_FILES_TO_BINARY_DIR( BinaryConstraintDataCopy SOURCE_FILES diff --git a/packages/rol/test/function/test_19.cpp b/packages/rol/test/function/test_19.cpp new file mode 100644 index 000000000000..3d1b5f0c68fa --- /dev/null +++ b/packages/rol/test/function/test_19.cpp @@ -0,0 +1,482 @@ +// @HEADER +// ************************************************************************ +// +// Rapid Optimization Library (ROL) Package +// Copyright (2014) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact lead developers: +// Drew Kouri (dpkouri@sandia.gov) and +// Denis Ridzal (dridzal@sandia.gov) +// +// ************************************************************************ +// @HEADER + +/*! \file test_19.cpp + \brief Test ReducedConstraintSimOpt class + +*/ + +#include "ROL_StdVector.hpp" +#include "ROL_Constraint_SimOpt.hpp" +#include "ROL_Reduced_Constraint_SimOpt.hpp" +#include "ROL_Stream.hpp" +#include "Teuchos_GlobalMPISession.hpp" +#include + +template +class constraint1 : public ROL::Constraint_SimOpt { +public: + constraint1() {} + void value(ROL::Vector &c, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(c.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector cs = dynamic_cast&>(c); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(cs.getVector()))[0] = std::exp(z1*u1)-z2*z2; + } + void solve(ROL::Vector &c, ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(c.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(us.getVector()))[0] = static_cast(2)*std::log(std::abs(z2)) / z1; + 
constraint1::value(c,u,z,tol); + } + void applyJacobian_1(ROL::Vector &jv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(jv.dimension()==1); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector jvs = dynamic_cast&>(jv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(jvs.getVector()))[0] = z1*std::exp(z1*u1)*v1; + } + void applyJacobian_2(ROL::Vector &jv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(jv.dimension()==1); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector jvs = dynamic_cast&>(jv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(jvs.getVector()))[0] = u1*std::exp(z1*u1)*v1 - static_cast(2)*z2*v2; + } + void applyInverseJacobian_1(ROL::Vector &ijv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ijv.dimension()==1); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ijvs = dynamic_cast&>(ijv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(ijvs.getVector()))[0] = v1 / (z1*std::exp(z1*u1)); + } + void applyAdjointJacobian_1(ROL::Vector &ajv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, 
Real &tol) { + constraint1::applyJacobian_1(ajv,v,u,z,tol); + } + void applyAdjointJacobian_2(ROL::Vector &ajv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ajv.dimension()==2); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ajvs = dynamic_cast&>(ajv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(ajvs.getVector()))[0] = u1*std::exp(z1*u1)*v1; + (*(ajvs.getVector()))[1] = -static_cast(2)*z2*v1; + } + void applyInverseAdjointJacobian_1(ROL::Vector &iajv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + constraint1::applyInverseJacobian_1(iajv,v,u,z,tol); + } + void applyAdjointHessian_11(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==1); + assert(w.dimension()==1); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(ahwvs.getVector()))[0] = z1*z1*std::exp(z1*u1)*v1*w1; + } + void applyAdjointHessian_12(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==2); + assert(w.dimension()==1); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = 
dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(ahwvs.getVector()))[0] = std::exp(z1*u1)*(static_cast(1)+u1*z1)*v1*w1; + (*(ahwvs.getVector()))[1] = static_cast(0); + } + void applyAdjointHessian_21(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==1); + assert(w.dimension()==1); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(ahwvs.getVector()))[0] = std::exp(z1*u1)*(static_cast(1)+u1*z1)*v1*w1; + } + void applyAdjointHessian_22(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==2); + assert(w.dimension()==1); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + (*(ahwvs.getVector()))[0] = u1*u1*std::exp(z1*u1)*v1*w1; 
+ (*(ahwvs.getVector()))[1] = -static_cast(2)*v2*w1; + } +}; + + +template +class constraint2 : public ROL::Constraint_SimOpt { +public: + constraint2() {} + void value(ROL::Vector &c, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(c.dimension()==3); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector cs = dynamic_cast&>(c); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(cs.getVector()))[0] = z1*z2*u1; + (*(cs.getVector()))[1] = (z1-z2)*u1; + (*(cs.getVector()))[2] = u1*u1; + } + void applyJacobian_1(ROL::Vector &jv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(jv.dimension()==3); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector jvs = dynamic_cast&>(jv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + const Real two(2); + Real v1 = (*(vs.getVector()))[0]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(jvs.getVector()))[0] = z1*z2*v1; + (*(jvs.getVector()))[1] = (z1-z2)*v1; + (*(jvs.getVector()))[2] = two*u1*v1; + } + void applyJacobian_2(ROL::Vector &jv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(jv.dimension()==3); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector jvs = dynamic_cast&>(jv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + 
(*(jvs.getVector()))[0] = z2*u1*v1 + z1*u1*v2; + (*(jvs.getVector()))[1] = (v1-v2)*u1; + (*(jvs.getVector()))[2] = static_cast(0); + } + void applyAdjointJacobian_1(ROL::Vector &ajv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ajv.dimension()==1); + assert(v.dimension()==3); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ajvs = dynamic_cast&>(ajv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + const Real two(2); + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real v3 = (*(vs.getVector()))[2]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(ajvs.getVector()))[0] = z1*z2*v1 + (z1-z2)*v2 + two*u1*v3; + } + void applyAdjointJacobian_2(ROL::Vector &ajv, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ajv.dimension()==2); + assert(v.dimension()==3); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ajvs = dynamic_cast&>(ajv); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real u1 = (*(us.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(ajvs.getVector()))[0] = (z2*u1*v1 + u1*v2); + (*(ajvs.getVector()))[1] = (z1*u1*v1 - u1*v2); + } + void applyAdjointHessian_11(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==1); + assert(w.dimension()==3); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = 
dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + const Real two(2); + Real w3 = (*(ws.getVector()))[2]; + Real v1 = (*(vs.getVector()))[0]; + (*(ahwvs.getVector()))[0] = two*v1*w3; + } + void applyAdjointHessian_12(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==2); + assert(w.dimension()==3); + assert(v.dimension()==1); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real w2 = (*(ws.getVector()))[1]; + Real v1 = (*(vs.getVector()))[0]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(ahwvs.getVector()))[0] = (z2*v1*w1 + v1*w2); + (*(ahwvs.getVector()))[1] = (z1*v1*w1 - v1*w2); + } + void applyAdjointHessian_21(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==1); + assert(w.dimension()==3); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real w2 = (*(ws.getVector()))[1]; + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real z1 = (*(zs.getVector()))[0]; + Real z2 = (*(zs.getVector()))[1]; + (*(ahwvs.getVector()))[0] = (v1*z2+z1*v2)*w1 + (v1-v2)*w2; + } + void applyAdjointHessian_22(ROL::Vector &ahwv, const ROL::Vector &w, const ROL::Vector &v, const ROL::Vector &u, const 
ROL::Vector &z, Real &tol) { + assert(ahwv.dimension()==2); + assert(w.dimension()==3); + assert(v.dimension()==2); + assert(u.dimension()==1); + assert(z.dimension()==2); + ROL::StdVector ahwvs = dynamic_cast&>(ahwv); + const ROL::StdVector ws = dynamic_cast&>(w); + const ROL::StdVector vs = dynamic_cast&>(v); + const ROL::StdVector us = dynamic_cast&>(u); + const ROL::StdVector zs = dynamic_cast&>(z); + Real w1 = (*(ws.getVector()))[0]; + Real v1 = (*(vs.getVector()))[0]; + Real v2 = (*(vs.getVector()))[1]; + Real u1 = (*(us.getVector()))[0]; + (*(ahwvs.getVector()))[0] = v2*u1*w1; + (*(ahwvs.getVector()))[1] = v1*u1*w1; + } +}; + +int main(int argc, char *argv[]) { + using RealT = double; + + Teuchos::GlobalMPISession mpiSession(&argc, &argv); + + // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided. + int iprint = argc - 1; + ROL::Ptr outStream; + ROL::nullstream bhs; // outputs nothing + if (iprint > 0) + outStream = ROL::makePtrFromRef(std::cout); + else + outStream = ROL::makePtrFromRef(bhs); + + // Save the format state of the original std::cout. + ROL::nullstream oldFormatState; + oldFormatState.copyfmt(std::cout); + +// RealT errtol = std::sqrt(ROL::ROL_THRESHOLD()); + + int errorFlag = 0; + + // *** Test body. 
+ + try { + + unsigned c1_dim = 1; // Constraint1 dimension + unsigned c2_dim = 3; // Constraint1 dimension + unsigned u_dim = 1; // State dimension + unsigned z_dim = 2; // Control dimension + + auto c1 = ROL::makePtr>(c1_dim); + auto c2 = ROL::makePtr>(c2_dim); + auto u = ROL::makePtr>(u_dim); + auto z = ROL::makePtr>(z_dim); + auto vc1 = ROL::makePtr>(c1_dim); + auto vc2 = ROL::makePtr>(c2_dim); + auto vu = ROL::makePtr>(u_dim); + auto vz = ROL::makePtr>(z_dim); + auto du = ROL::makePtr>(u_dim); + auto dz = ROL::makePtr>(z_dim); + c1->randomize(static_cast(-1),static_cast(1)); + c2->randomize(static_cast(-1),static_cast(1)); + u->randomize(static_cast(-1),static_cast(1)); + z->randomize(static_cast(-1),static_cast(1)); + vc1->randomize(static_cast(-1),static_cast(1)); + vc2->randomize(static_cast(-1),static_cast(1)); + vu->randomize(static_cast(-1),static_cast(1)); + vz->randomize(static_cast(-1),static_cast(1)); + du->randomize(static_cast(-1),static_cast(1)); + dz->randomize(static_cast(-1),static_cast(1)); + + auto con1 = ROL::makePtr>(); + auto con2 = ROL::makePtr>(); + auto stateStore = ROL::makePtr>(); + auto rcon = ROL::makePtr>(con2,con1,stateStore,u,z,vc1,c2,true,false); + + con1->checkSolve(*u,*z,*c1,true,*outStream); + con1->checkAdjointConsistencyJacobian_1(*vc1,*vu,*u,*z,true,*outStream); + con1->checkAdjointConsistencyJacobian_2(*vc1,*vz,*u,*z,true,*outStream); + con1->checkInverseJacobian_1(*c1,*vu,*u,*z,true,*outStream); + con1->checkInverseAdjointJacobian_1(*c1,*vu,*u,*z,true,*outStream); + con1->checkApplyJacobian_1(*u,*z,*vu,*vc1,true,*outStream); + con1->checkApplyJacobian_2(*u,*z,*vz,*vc1,true,*outStream); + con1->checkApplyAdjointHessian_11(*u,*z,*vc1,*vu,*du,true,*outStream); + con1->checkApplyAdjointHessian_12(*u,*z,*vc1,*vu,*dz,true,*outStream); + con1->checkApplyAdjointHessian_21(*u,*z,*vc1,*vz,*du,true,*outStream); + con1->checkApplyAdjointHessian_22(*u,*z,*vc1,*vz,*dz,true,*outStream); + + 
con2->checkAdjointConsistencyJacobian_1(*vc2,*vu,*u,*z,true,*outStream); + con2->checkAdjointConsistencyJacobian_2(*vc2,*vz,*u,*z,true,*outStream); + con2->checkApplyJacobian_1(*u,*z,*vu,*vc2,true,*outStream); + con2->checkApplyJacobian_2(*u,*z,*vz,*vc2,true,*outStream); + con2->checkApplyAdjointHessian_11(*u,*z,*vc2,*vu,*du,true,*outStream); + con2->checkApplyAdjointHessian_12(*u,*z,*vc2,*vu,*dz,true,*outStream); + con2->checkApplyAdjointHessian_21(*u,*z,*vc2,*vz,*du,true,*outStream); + con2->checkApplyAdjointHessian_22(*u,*z,*vc2,*vz,*dz,true,*outStream); + + rcon->checkAdjointConsistencyJacobian(*vc2,*vz,*z,true,*outStream); + rcon->checkApplyJacobian(*z,*vz,*vc2,true,*outStream); + rcon->checkApplyAdjointHessian(*z,*vc2,*vz,*dz,true,*outStream); + } + catch (std::logic_error& err) { + *outStream << err.what() << "\n"; + errorFlag = -1000; + }; // end try + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return 0; + + +} + From 40d30ef76e9c5c8b0915928de689884c823164c0 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 2 Oct 2024 17:35:46 -0600 Subject: [PATCH 058/243] Removed an incorrect call to forward in ROL_VectorClone.hpp Signed-off-by: Greg von Winckel --- packages/rol/src/vector/ROL_VectorClone.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/src/vector/ROL_VectorClone.hpp b/packages/rol/src/vector/ROL_VectorClone.hpp index ed6e308239b7..31928e07f3fc 100644 --- a/packages/rol/src/vector/ROL_VectorClone.hpp +++ b/packages/rol/src/vector/ROL_VectorClone.hpp @@ -136,7 +136,7 @@ class VectorCloneMap { /** \brief Preallocate keys if desired */ template VectorCloneMap( Keys&&...keys ) { - Constructor_Impl( forward(keys)... ); + Constructor_Impl( keys... 
); } Ptr> operator() ( const Vector& x, KeyType key ) { From 5fa0d09432e5e7e1b73d89e115ee2541be2f3479 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Thu, 10 Oct 2024 17:19:26 -0600 Subject: [PATCH 059/243] Added shell script find_parameters.sh for rapidly finding all relevant parameter usage. Added local variables to split up some ParameterList expressions in ROL and make C++ source code easier to parse Signed-off-by: Greg von Winckel --- .../rol/rol_parameters/find_parameters.sh | 44 ++ packages/rol/rol_parameters/rol_parameters.py | 289 ++++++------ .../rol/rol_parameters/rol_parameters.xml | 442 ------------------ .../TypeB/ROL_TypeB_AlgorithmFactory.hpp | 12 +- .../ROL_TypeB_ColemanLiAlgorithm_Def.hpp | 3 +- .../ROL_TypeB_KelleySachsAlgorithm_Def.hpp | 3 +- .../TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp | 3 +- .../TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp | 3 +- .../ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp | 7 +- .../TypeE/ROL_TypeE_AlgorithmFactory.hpp | 3 +- .../TypeG/ROL_TypeG_AlgorithmFactory.hpp | 3 +- .../TypeP/ROL_TypeP_AlgorithmFactory.hpp | 3 +- .../ROL_TypeP_TrustRegionAlgorithm_Def.hpp | 3 +- .../TypeU/ROL_TypeU_AlgorithmFactory.hpp | 3 +- .../TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp | 3 +- .../ROL_TypeU_LineSearchAlgorithm_Def.hpp | 3 +- .../ROL_TypeU_TrustRegionAlgorithm_Def.hpp | 6 +- .../TypeU/linesearch/ROL_LineSearch_U.hpp | 6 +- .../ROL_ScalarMinimizationLineSearch_U.hpp | 6 +- .../ROL_PolyhedralProjectionFactory.hpp | 3 +- .../ROL_SemismoothNewtonProjection_Def.hpp | 1 + packages/rol/src/step/ROL_BundleStep.hpp | 3 +- packages/rol/src/step/ROL_FletcherStep.hpp | 3 +- packages/rol/src/step/ROL_LineSearchStep.hpp | 9 +- .../src/step/ROL_PrimalDualActiveSetStep.hpp | 3 +- .../rol/src/step/ROL_ProjectedSecantStep.hpp | 3 +- packages/rol/src/step/ROL_TrustRegionStep.hpp | 18 +- .../rol/src/step/linesearch/ROL_Brents.hpp | 3 +- .../src/step/linesearch/ROL_LineSearch.hpp | 7 +- .../ROL_ScalarMinimizationLineSearch.hpp | 8 +- 
.../rol/src/step/secant/ROL_SecantFactory.hpp | 4 +- .../src/step/trustregion/ROL_TrustRegion.hpp | 13 +- 32 files changed, 297 insertions(+), 626 deletions(-) create mode 100755 packages/rol/rol_parameters/find_parameters.sh delete mode 100644 packages/rol/rol_parameters/rol_parameters.xml diff --git a/packages/rol/rol_parameters/find_parameters.sh b/packages/rol/rol_parameters/find_parameters.sh new file mode 100755 index 000000000000..0677cdad2821 --- /dev/null +++ b/packages/rol/rol_parameters/find_parameters.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# File: find_parameters.sh + +# Strip C/C++ style comments from source code +remove_comments() { + perl -0777 -pe 's{//.*$}{}gm; s{/\*.*?\*/}{}gs' +} + +# Check if a directory is provided as an argument, otherwise use the current directory +search_dir="${1:-.}" + +# Verify that the directory exists +if [ ! -d "${search_dir}" ]; then + echo "Error: Directory '${search_dir}' does not exist." + echo "Usage: ${0} [directory_to_search]" + exit 1 +fi + +# Updated pattern definitions +getset_pattern="(\.|\->)\s*((s|g)et|sublist)\s*\(\s*\"" +sublist_pattern="(\.|\->)\s*sublist\s*\(\s*\"" +pattern="${getset_pattern}|${sublist_pattern}" + +# Function to generate find's -not -path arguments for excluded directories +generate_exclude_args() { + local IFS=',' + local exclude_args="" + for dir in $1; do + exclude_args="${exclude_args} -not -path */${dir}/*" + done + echo ${exclude_args} +} + +excluded_dirs="compatability,dynamic,interiorpoint,oed,sol,zoo" +exclude_args=$(generate_exclude_args "${excluded_dirs}") + +find "${search_dir}" -type f -name '*.hpp' ${exclude_args} | while read -r file; do + result=$(remove_comments < "${file}" | grep -E "${pattern}" || true) + if [ -n "${result}" ]; then + echo "$result" | while IFS= read -r line; do + echo "${file}:${line}" + done + fi +done diff --git a/packages/rol/rol_parameters/rol_parameters.py b/packages/rol/rol_parameters/rol_parameters.py index 4945d9351e75..c3c970640819 100644 --- 
a/packages/rol/rol_parameters/rol_parameters.py +++ b/packages/rol/rol_parameters/rol_parameters.py @@ -1,146 +1,175 @@ +# File rol_parameters.py +import sys +import json +import pathlib import re import subprocess -import pathlib -from typing import Dict, List, Tuple -import xml.etree.ElementTree as ET +from typing import Dict, List +from collections import defaultdict + +def hierarchy_to_json(tuple_set): + def add_to_hierarchy(hierarchy, path, item_type): + current = hierarchy + for part in path[:-1]: + if part not in current["Sublists"]: + current["Sublists"][part] = {"Parameters": [], "Sublists": {}} + current = current["Sublists"][part] + + if item_type == "Parameter": + current["Parameters"].append(path[-1]) + elif item_type == "Sublist": + if path[-1] not in current["Sublists"]: + current["Sublists"][path[-1]] = {"Parameters": [], "Sublists": {}} + + root = {"Parameters": [], "Sublists": {}} + + for tuple_path in tuple_set: + path = [item[0] for item in tuple_path] + item_type = tuple_path[-1][1] + add_to_hierarchy(root, path, item_type) + + return root + +from xml.etree.ElementTree import Element, SubElement, tostring from xml.dom import minidom -def create_xml_from_dict(dict_data: Dict, root_name: str = "Inputs") -> ET.Element: - def create_element(name: str, content: Dict) -> ET.Element: - element = ET.Element(name) - if isinstance(content, dict): - for key, value in content.items(): - if key == "Parameters" and value: - param_element = ET.SubElement(element, "Parameter") - param_element.set("name", "Valid Keys") - param_element.set("type", "Array(string)") - param_element.set("value", "{" + ",".join(value) + "}") - elif key == "Sublists": - for sublist_name, sublist_content in value.items(): - sublist_element = create_element("ParameterList", sublist_content) - sublist_element.set("name", sublist_name) - element.append(sublist_element) - return element - - root = create_element("ParameterList", dict_data) - root.set("name", root_name) - return root - 
-def prettify(elem: ET.Element) -> str: - rough_string = ET.tostring(elem, 'utf-8') +def json_to_xml(json_obj): + def create_parameter_list(name, data): + param_list = Element("ParameterList", name=name) + + if "Parameters" in data and data["Parameters"]: + param_str = ",".join(data["Parameters"]) + param = SubElement(param_list, "Parameter", name="Parameters", type="Array(string)", value=f"{{{param_str}}}") + + if "Sublists" in data: + for sublist_name, sublist_data in data["Sublists"].items(): + sub_param_list = create_parameter_list(sublist_name, sublist_data) + if len(sub_param_list) > 0: + param_list.append(sub_param_list) + + return param_list + + root = create_parameter_list("ROL Parameters", json_obj) + + # Remove empty ParameterLists + for elem in root.iter("ParameterList"): + if len(elem) == 0: + root.remove(elem) + + # Convert to string and pretty print + rough_string = tostring(root, 'utf-8') reparsed = minidom.parseString(rough_string) return reparsed.toprettyxml(indent=" ") -def grep_source_files(src_directory: str) -> str: - grep_command = [ - 'grep', - '-rE', - '-e', r'(\.|\->)\s*(((s|g)et\s*\(\s*"([a-zA-Z0-9]|\s)+"\s*,\s*\S+\s*\))|sublist)', - '-e', r'(\.|\->)\s*sublist\s*\(\s*\"', - src_directory - ] +def pretty_print_xml(xml_str : str): + def remove_extra_newlines(string : str): + return '\n'.join(line for line in string.split('\n') if line.strip()) + parsed_xml = minidom.parseString(xml_str) + return remove_extra_newlines(parsed_xml.toprettyxml(indent=" ")) + +def replace_whitespace(input_string): + return re.sub(r'\s{2,}', ' ', input_string) + +def dereference(code): + return re.sub(r'\&','',code) + + +def extract_quoted_strings(input_string): + # Use a regular expression to find all substrings in double quotes + matches = re.findall(r'"(.*?)"', input_string) + # Convert the list of matches to a tuple + return tuple(matches) + +def get_lines( search_path : pathlib.Path ) -> List[str]: try: - result = subprocess.run(grep_command, 
capture_output=True, text=True, check=True) - return result.stdout + result = subprocess.run(['./find_parameters.sh', search_path], capture_output=True, text=True, check=True) + return result.stdout.splitlines() except subprocess.CalledProcessError as e: print(f"Error occurred: {e}") - return e.stderr - -def split_cpp_code(code_string: str) -> List[str]: - tokens = re.split(r'->|\.', code_string) - return [token.strip() for token in tokens if token.strip()] - -def extract_quoted_strings(string_list: List[str]) -> Tuple[str, ...]: - return tuple(s.strip('"') for s in string_list if s.startswith('"') and s.endswith('"')) - -def parse_cpp_strings(input_list: List[str]) -> List[str]: - parsed_list = [] - for item in input_list: - match = re.search(r'(\w+)$|"([^"]*)"|\b(?:get|set)\s*\(\s*"([^"]*)"', item) - if match: - if match.group(1): - parsed_list.append(match.group(1)) - elif match.group(2): - parsed_list.append(f'"{match.group(2)}"') - elif match.group(3): - parsed_list.append(f'"{match.group(3)}"') - return parsed_list - -def build_hierarchy(data: Dict[str, List[str]]) -> Dict[str, List[str]]: - def resolve_list(value_list: List[str]) -> List[str]: - if not value_list: - return value_list - first_item = value_list[0] - if first_item in data and not first_item.startswith('"'): - return resolve_list(data[first_item]) + value_list[1:] - else: - return [first_item] + resolve_list(value_list[1:]) - - return {key: resolve_list(value) for key, value in data.items()} - -def create_hierarchical_dict(list_of_lists: List[List[str]]) -> Dict: - result = {} - for path in list_of_lists: - current = result - for key in path[:-1]: - current = current.setdefault(key, {}) - current[path[-1]] = {} + +def prune_tuples(data): + # Sort the tuples by length in descending order + sorted_data = sorted(data, key=len, reverse=True) + + result = [] + leading_elements_set = set() + + for tup in sorted_data: + # Create a leading element tuple (all but the last element) + leading_elements = 
tup[:-1] + + # Check if the leading elements are already in the set + if leading_elements not in leading_elements_set: + # If not, add the tuple to the result and update the set + result.append(tup) + leading_elements_set.add(leading_elements) + return result -def parse(params: Dict) -> Dict: - result = {'Parameters': [], 'Sublists': {}} - for k, v in params.items(): - if isinstance(v, dict): - if v: - result['Sublists'][k] = parse(v) - else: - result['Parameters'].append(k) - else: - result['Parameters'].append(k) - return result if __name__ == '__main__': - rol_src = (pathlib.Path.cwd().parents[0]/'src').resolve() - make_relative = lambda path_str: pathlib.Path(path_str).relative_to(rol_src, walk_up=True) - strip_excess_whitespace = lambda text: re.sub(r'\s+', ' ', text).strip() - - local_sublist_pattern = re.compile(r'[ParameterList|auto]\s*[&]\s*(\w+)\s*=\s*(\w+)[\.|\->](.*)') - exclusions = ['compatibility', 'step', 'zoo', 'sol', 'oed', 'dynamic'] - - output = grep_source_files(str(rol_src)) - data = {} - for line in output.splitlines(): - file, *code_parts = line.split(':') - file = str(make_relative(file)) - code = strip_excess_whitespace(':'.join(code_parts)) - if not any(f'{e}/' in file for e in exclusions): - data.setdefault(file, []).append(code) - - paramset = set() - for file, code in data.items(): - sublist = {} - parameters = [] - for line in code: - match = re.search(local_sublist_pattern, line) - if match: - sublist[match.group(1)] = [match.group(2)] + parse_cpp_strings(split_cpp_code(match.group(3))) - else: - if '=' in line: - line = line.split('=')[1].strip() - parameters.append(parse_cpp_strings(split_cpp_code(line))) - - sublist = build_hierarchy(sublist) - parameters = [build_hierarchy(sublist)[sublist_key] + param[1:] for param in parameters for sublist_key in sublist] - paramset.update(map(extract_quoted_strings, parameters)) - parameters = create_hierarchical_dict(sorted(filter(len, map(list, paramset)))) - parameters.pop('SOL', None) - - 
xml_root = create_xml_from_dict(parse(parameters)) - pretty_xml = prettify(xml_root) - - with open('rol_parameters.xml', 'w') as f: - f.write(pretty_xml) - - print("XML file 'rol_parameters.xml' has been created.") + src = sys.argv[1] + grep_pattern = r'(\.|\->)\s*((s|g)et|sublist)\s*\(\s*\"' + exclude_dirs = 'compatability,dynamic,interiorpoint,oed,sol,zoo' + + # Get all lines in C++ source in which a ParameterList's sublist, get, or set method is called + lines = get_lines(src) + + data = defaultdict(list) + + defines_local_sublist = lambda line: line.split('.')[-1].startswith('sublist') + + for line in lines: + filename,code = line.split(':',1) + if not code.startswith('//'): # Excluded commented out lines + data[filename].append(replace_whitespace(code.strip())) + + pattern = re.compile(r'(?:sublist|get|set)\s*\(\s*"([^"]*)"\s*(?:\)|,)') + + expanded_lines = list() + + for key, value in data.items(): + ldefs = dict() + for line in value: + if defines_local_sublist(line): + lhs, rhs = line.split('=') + if len(dereference(lhs).split()) == 2: + var = dereference(lhs).split()[1] + ldefs[var] = rhs.strip()[:-1] + + if len(ldefs): + for cycle in range(len(ldefs)): + for k,v in ldefs.items(): + vl,vr = v.split('.',1) + if vl in ldefs.keys(): + ldefs[k] = f'{ldefs[vl]}.{vr}' + + + for line in value: + exline = line + for k,v in ldefs.items(): + exline = re.sub(rf' {k}.',rf' {v}.', exline) + expanded_lines.append(exline) + + pair_tuples = set() + for line in expanded_lines: + if '=' in line: + line = line.split('=')[1] + if 'et(' in line and 'SOL' not in line: + line = line.split(',')[0] + token_tuple = extract_quoted_strings(line) + depth = len(token_tuple) + if depth > 1: + type_tuple = ('Sublist',) * (depth-1) + ('Parameter',) + pair_tuples.add(tuple(zip(token_tuple,type_tuple))) + + rol_json = hierarchy_to_json(set(pair_tuples)) + with open('rol_parameters.json','w') as jsonfile: + jsonfile.write(json.dumps(rol_json,indent=2)) + + xml_output = json_to_xml(rol_json) 
+ pretty_xml_output = pretty_print_xml(xml_output) + with open('rol_parameters.xml','w') as xmlfile: + xmlfile.write(pretty_xml_output) diff --git a/packages/rol/rol_parameters/rol_parameters.xml b/packages/rol/rol_parameters/rol_parameters.xml deleted file mode 100644 index 21354eb88fd0..000000000000 --- a/packages/rol/rol_parameters/rol_parameters.xml +++ /dev/null @@ -1,442 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp index 3837afb7e37b..018cf9860088 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_AlgorithmFactory.hpp @@ -144,17 +144,16 @@ inline EAlgorithmB StringToEAlgorithmB(std::string s) { template inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr> &secant = nullPtr) { - EAlgorithmB ealg = StringToEAlgorithmB(parlist.sublist("Step").get("Type","Trust Region")); + std::string stepType = parlist.sublist("Step").get("Type","Trust Region"); + EAlgorithmB ealg = StringToEAlgorithmB(stepType); 
switch(ealg) { case ALGORITHM_B_LINESEARCH: { - std::string desc - = parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Newton-Krylov"); + std::string desc = parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Newton-Krylov"); if (desc=="Newton-Krylov" || desc=="Newton") return makePtr>(parlist,secant); else if (desc=="Quasi-Newton Method" || desc=="Quasi-Newton") { - std::string method - = parlist.sublist("Step").sublist("Line Search").sublist("Quasi-Newton").get("Method","L-Secant-B"); + std::string method = parlist.sublist("Step").sublist("Line Search").sublist("Quasi-Newton").get("Method","L-Secant-B"); if (method == "L-Secant-B") return makePtr>(parlist,secant); // Similar to L-BFGS-B else @@ -166,8 +165,7 @@ inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr>(parlist,secant); else if (trmod=="SPG") diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp index 6597ad726f67..b7f427411b93 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_ColemanLiAlgorithm_Def.hpp @@ -93,7 +93,8 @@ ColemanLiAlgorithm::ColemanLiAlgorithm(ParameterList &list, // Initialize trust region model model_ = makePtr>(list,secant,mode); if (secant == nullPtr) { - esec_ = StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } } diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp index c0e910bbcf75..f4ae2715bf97 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp +++ 
b/packages/rol/src/algorithm/TypeB/ROL_TypeB_KelleySachsAlgorithm_Def.hpp @@ -88,7 +88,8 @@ KelleySachsAlgorithm::KelleySachsAlgorithm(ParameterList &list, useSecantPrecond_ = list.sublist("General").sublist("Secant").get("Use as Preconditioner", false); useSecantHessVec_ = list.sublist("General").sublist("Secant").get("Use as Hessian", false); if (secant == nullPtr) { - esec_ = StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } } diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp index 1fded5aacbc9..92f0c7a3cf2f 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LSecantBAlgorithm_Def.hpp @@ -81,7 +81,8 @@ LSecantBAlgorithm::LSecantBAlgorithm(ParameterList &list, useSecantHessVec_ = true; ESecantMode mode = SECANTMODE_BOTH; if (secant == nullPtr) { - esec_ = StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory Secant")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory Secant"); + esec_ = StringToESecant(secantType); secant_ = SecantFactory(list,mode); } } diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp index 33a2a1b179e4..cdbf38c2cb3d 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_LinMoreAlgorithm_Def.hpp @@ -118,7 +118,8 @@ LinMoreAlgorithm::LinMoreAlgorithm(ParameterList &list, // Initialize trust region model model_ = makePtr>(list,secant,mode); if (secant == nullPtr) { - esec_ = 
StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } } diff --git a/packages/rol/src/algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp index 93f100f1af8a..e7ca1bc0ee88 100644 --- a/packages/rol/src/algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeB/ROL_TypeB_TrustRegionSPGAlgorithm_Def.hpp @@ -95,7 +95,9 @@ TrustRegionSPGAlgorithm::TrustRegionSPGAlgorithm(ParameterList &list, tol2_ = lmlist.sublist("Solver").get("Relative Tolerance", 1e-2); useMin_ = lmlist.sublist("Solver").get("Use Smallest Model Iterate", true); useNMSP_ = lmlist.sublist("Solver").get("Use Nonmonotone Search", false); - useSimpleSPG_ = !lmlist.sublist("Solver").get("Compute Cauchy Point", true); + + bool useCachyPoint = lmlist.sublist("Solver").get("Compute Cauchy Point", true); + useSimpleSPG_ = !useCachyPoint; // Inexactness Information ParameterList &glist = list.sublist("General"); useInexact_.clear(); @@ -125,7 +127,8 @@ TrustRegionSPGAlgorithm::TrustRegionSPGAlgorithm(ParameterList &list, // Initialize trust region model model_ = makePtr>(list,secant,mode); if (secant == nullPtr) { - esec_ = StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } } diff --git a/packages/rol/src/algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp index 709968d76f86..4f02d5417bbd 100644 --- a/packages/rol/src/algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeE/ROL_TypeE_AlgorithmFactory.hpp @@ -123,7 +123,8 @@ inline 
EAlgorithmE StringToEAlgorithmE(std::string s) { template inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr> &secant = nullPtr) { - EAlgorithmE ealg = StringToEAlgorithmE(parlist.sublist("Step").get("Type","Augmented Lagrangian")); + std::string stepType = parlist.sublist("Step").get("Type","Augmented Lagrangian"); + EAlgorithmE ealg = StringToEAlgorithmE(stepType); switch(ealg) { case ALGORITHM_E_AUGMENTEDLAGRANGIAN: return makePtr>(parlist,secant); case ALGORITHM_E_FLETCHER: return makePtr>(parlist,secant); diff --git a/packages/rol/src/algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp index 113392ee7aee..f6164f3bd336 100644 --- a/packages/rol/src/algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeG/ROL_TypeG_AlgorithmFactory.hpp @@ -128,7 +128,8 @@ inline EAlgorithmG StringToEAlgorithmG(std::string s) { template inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr> &secant = nullPtr) { - EAlgorithmG ealg = StringToEAlgorithmG(parlist.sublist("Step").get("Type","Augmented Lagrangian")); + std::string stepType = parlist.sublist("Step").get("Type","Augmented Lagrangian"); + EAlgorithmG ealg = StringToEAlgorithmG(stepType); switch(ealg) { case ALGORITHM_G_AUGMENTEDLAGRANGIAN: return makePtr>(parlist,secant); case ALGORITHM_G_MOREAUYOSIDA: return makePtr>(parlist,secant); diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp index ca780ad30804..065040e25bcf 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_AlgorithmFactory.hpp @@ -130,7 +130,8 @@ inline EAlgorithmP StringToEAlgorithmP(std::string s) { template inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr> &secant = nullPtr) { - EAlgorithmP ealg = StringToEAlgorithmP(parlist.sublist("Step").get("Type","Trust Region")); + 
std::string stepType = parlist.sublist("Step").get("Type","Trust Region"); + EAlgorithmP ealg = StringToEAlgorithmP(stepType); switch(ealg) { case ALGORITHM_P_LINESEARCH: { diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp index 569b2d304b28..8e16cfa7dfda 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_TrustRegionAlgorithm_Def.hpp @@ -134,7 +134,8 @@ TrustRegionAlgorithm::TrustRegionAlgorithm(ParameterList &list, // Initialize trust region model model_ = makePtr>(list,secant,mode); if (secant == nullPtr) { - esec_ = StringToESecant(list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = list.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } } diff --git a/packages/rol/src/algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp b/packages/rol/src/algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp index 11695d24865a..542a4b38ca50 100644 --- a/packages/rol/src/algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp +++ b/packages/rol/src/algorithm/TypeU/ROL_TypeU_AlgorithmFactory.hpp @@ -123,7 +123,8 @@ inline EAlgorithmU StringToEAlgorithmU(std::string s) { template inline Ptr> AlgorithmFactory(ParameterList &parlist, const Ptr> &secant = nullPtr) { - EAlgorithmU ealg = StringToEAlgorithmU(parlist.sublist("Step").get("Type","Trust Region")); + std::string stepType = parlist.sublist("Step").get("Type","Trust Region"); + EAlgorithmU ealg = StringToEAlgorithmU(stepType); switch(ealg) { case ALGORITHM_U_BUNDLE: return makePtr>(parlist); case ALGORITHM_U_LINESEARCH: return makePtr>(parlist,secant); diff --git a/packages/rol/src/algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp index 846221ca4a7f..4be8b80fc10e 100644 --- 
a/packages/rol/src/algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeU/ROL_TypeU_BundleAlgorithm_Def.hpp @@ -82,7 +82,8 @@ BundleAlgorithm::BundleAlgorithm( ParameterList &parlist, Real omega = blist.get("Locality Measure Coefficient", two); unsigned maxSize = blist.get("Maximum Bundle Size", 200); unsigned remSize = blist.get("Removal Size for Bundle Update", 2); - if ( blist.get("Cutting Plane Solver",0) == 1 ) { + int cps = blist.get("Cutting Plane Solver",0); + if ( cps == 1 ) { bundle_ = makePtr>(maxSize,coeff,omega,remSize); } else { diff --git a/packages/rol/src/algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp index e612c7f5ea22..803872d283fb 100644 --- a/packages/rol/src/algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeU/ROL_TypeU_LineSearchAlgorithm_Def.hpp @@ -65,7 +65,8 @@ LineSearchAlgorithm::LineSearchAlgorithm( ParameterList &parlist, // Parse parameter list ParameterList& Llist = parlist.sublist("Step").sublist("Line Search"); ParameterList& Glist = parlist.sublist("General"); - econd_ = StringToECurvatureConditionU(Llist.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions") ); + std::string condType = Llist.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureConditionU(condType); acceptLastAlpha_ = Llist.get("Accept Last Alpha", false); verbosity_ = Glist.get("Output Level",0); printHeader_ = verbosity_ > 2; diff --git a/packages/rol/src/algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp index 69dec1a20806..1e700333c160 100644 --- a/packages/rol/src/algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp +++ b/packages/rol/src/algorithm/TypeU/ROL_TypeU_TrustRegionAlgorithm_Def.hpp @@ -91,14 +91,16 @@ TrustRegionAlgorithm::TrustRegionAlgorithm( ParameterList 
&parlist, updateIter_ = vlist.get("Forcing Sequence Update Frequency", static_cast(10)); forceFactor_ = vlist.get("Forcing Sequence Reduction Factor", static_cast(0.1)); // Initialize Trust Region Subproblem Solver Object - etr_ = StringToETrustRegionU(trlist.get("Subproblem Solver", "Dogleg")); + std::string solverName = trlist.get("Subproblem Solver", "Dogleg"); + etr_ = StringToETrustRegionU(solverName); solver_ = TrustRegionUFactory(parlist); verbosity_ = glist.get("Output Level", 0); // Secant Information useSecantPrecond_ = glist.sublist("Secant").get("Use as Preconditioner", false); useSecantHessVec_ = glist.sublist("Secant").get("Use as Hessian", false); if (secant == nullPtr) { - esec_ = StringToESecant(glist.sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = glist.sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); } // Initialize trust region model model_ = makePtr>(parlist,secant); diff --git a/packages/rol/src/algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp b/packages/rol/src/algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp index d72d38199d24..868ccade00f4 100644 --- a/packages/rol/src/algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp +++ b/packages/rol/src/algorithm/TypeU/linesearch/ROL_LineSearch_U.hpp @@ -115,8 +115,10 @@ class LineSearch_U { LineSearch_U( ParameterList &parlist ) { Real one(1), p9(0.9), p6(0.6), p4(0.4), oem4(1.e-4), zero(0); // Enumerations - edesc_ = StringToEDescentU(parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Quasi-Newton Method")); - econd_ = StringToECurvatureConditionU(parlist.sublist("Step").sublist("Line Search").sublist("Curvature Condition").get("Type","Strong Wolfe Conditions")); + std::string descentType = parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Quasi-Newton Method"); + edesc_ = StringToEDescentU(descentType); + std::string condType = 
parlist.sublist("Step").sublist("Line Search").sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureConditionU(condType); // Linesearch Parameters alpha0_ = parlist.sublist("Step").sublist("Line Search").get("Initial Step Size",one); alpha0bnd_ = parlist.sublist("Step").sublist("Line Search").get("Lower Bound for Initial Step Size",one); diff --git a/packages/rol/src/algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp b/packages/rol/src/algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp index 30d6e25378a9..66f43c9c9b3a 100644 --- a/packages/rol/src/algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp +++ b/packages/rol/src/algorithm/TypeU/linesearch/ROL_ScalarMinimizationLineSearch_U.hpp @@ -234,7 +234,8 @@ class ScalarMinimizationLineSearch_U : public LineSearch_U { sf_ = sf; // Status test for line search - econd_ = StringToECurvatureConditionU(list0.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions")); + std::string condName = list0.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureConditionU(condName); max_nfval_ = list0.get("Function Evaluation Limit",20); c1_ = list0.get("Sufficient Decrease Tolerance",oem4); c2_ = list0.sublist("Curvature Condition").get("General Parameter",p9); @@ -247,7 +248,8 @@ class ScalarMinimizationLineSearch_U : public LineSearch_U { c1_ = oem4; c2_ = p9; } - EDescentU edesc = StringToEDescentU(list0.sublist("Descent Method").get("Type","Quasi-Newton Method")); + std::string descentName = list0.sublist("Descent Method").get("Type","Quasi-Newton Method"); + EDescentU edesc = StringToEDescentU(descentName); if ( edesc == DESCENT_U_NONLINEARCG ) { c2_ = p4; c3_ = std::min(one-c2_,c3_); diff --git a/packages/rol/src/function/polyproj/ROL_PolyhedralProjectionFactory.hpp b/packages/rol/src/function/polyproj/ROL_PolyhedralProjectionFactory.hpp index 68cf45dacede..d0d420ef58cf 100644 --- 
a/packages/rol/src/function/polyproj/ROL_PolyhedralProjectionFactory.hpp +++ b/packages/rol/src/function/polyproj/ROL_PolyhedralProjectionFactory.hpp @@ -141,7 +141,8 @@ inline Ptr> PolyhedralProjectionFactory(const Vector< const Vector &mul, const Vector &res, ParameterList &list) { - EPolyProjAlgo ealg = StringToEPolyProjAlgo(list.sublist("General").sublist("Polyhedral Projection").get("Type","Dykstra")); + std::string projectionType = list.sublist("General").sublist("Polyhedral Projection").get("Type","Dykstra"); + EPolyProjAlgo ealg = StringToEPolyProjAlgo(projectionType); switch(ealg) { case PPA_DAIFLETCHER: return makePtr>(xprim,xdual,bnd,con,mul,res,list); break; case PPA_DYKSTRA: return makePtr>(xprim,xdual,bnd,con,mul,res,list); break; diff --git a/packages/rol/src/function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp b/packages/rol/src/function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp index 04f0d346bea3..0acc2bf0fdbe 100644 --- a/packages/rol/src/function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp +++ b/packages/rol/src/function/polyproj/ROL_SemismoothNewtonProjection_Def.hpp @@ -142,6 +142,7 @@ SemismoothNewtonProjection::SemismoothNewtonProjection(const Vector useproj_ = ppl.sublist("Semismooth Newton").get("Project onto Separating Hyperplane", DEFAULT_useproj_); ParameterList klist; + klist.sublist("General").sublist("Krylov") = ppl.sublist("Semismooth Newton").sublist("Krylov"); klist.sublist("General").set("Inexact Hessian-Times-A-Vector", false); krylov_ = KrylovFactory(klist); diff --git a/packages/rol/src/step/ROL_BundleStep.hpp b/packages/rol/src/step/ROL_BundleStep.hpp index 3bc774596d57..ca1cdcba30e7 100644 --- a/packages/rol/src/step/ROL_BundleStep.hpp +++ b/packages/rol/src/step/ROL_BundleStep.hpp @@ -140,7 +140,8 @@ class BundleStep : public Step { Real omega = parlist.sublist("Step").sublist("Bundle").get("Locality Measure Coefficient", two); unsigned maxSize = parlist.sublist("Step").sublist("Bundle").get("Maximum Bundle 
Size", 200); unsigned remSize = parlist.sublist("Step").sublist("Bundle").get("Removal Size for Bundle Update", 2); - if ( parlist.sublist("Step").sublist("Bundle").get("Cutting Plane Solver",0) == 1 ) { + int cps = parlist.sublist("Step").sublist("Bundle").get("Cutting Plane Solver",0); + if (cps) { bundle_ = ROL::makePtr>(maxSize,coeff,omega,remSize); //bundle_ = ROL::makePtr>(maxSize,coeff,omega,remSize); } diff --git a/packages/rol/src/step/ROL_FletcherStep.hpp b/packages/rol/src/step/ROL_FletcherStep.hpp index 70ebf2e9b65d..f27f0756ad8c 100644 --- a/packages/rol/src/step/ROL_FletcherStep.hpp +++ b/packages/rol/src/step/ROL_FletcherStep.hpp @@ -176,7 +176,8 @@ class FletcherStep : public Step { else { step_ = makePtr>(trlist); } - etr_ = StringToETrustRegion(parlist_.sublist("Step").sublist("Trust Region").get("Subproblem Solver", "Truncated CG")); + std::string solverType = parlist_.sublist("Step").sublist("Trust Region").get("Subproblem Solver", "Truncated CG"); + etr_ = StringToETrustRegion(solverType); // Initialize class members g_ = g.clone(); diff --git a/packages/rol/src/step/ROL_LineSearchStep.hpp b/packages/rol/src/step/ROL_LineSearchStep.hpp index 3501d75becbe..c7c7632908fe 100644 --- a/packages/rol/src/step/ROL_LineSearchStep.hpp +++ b/packages/rol/src/step/ROL_LineSearchStep.hpp @@ -214,7 +214,8 @@ class LineSearchStep : public Step { // Parse parameter list ROL::ParameterList& Llist = parlist.sublist("Step").sublist("Line Search"); ROL::ParameterList& Glist = parlist.sublist("General"); - econd_ = StringToECurvatureCondition(Llist.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions") ); + std::string condName = Llist.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureCondition(condName); acceptLastAlpha_ = Llist.get("Accept Last Alpha", false); verbosity_ = Glist.get("Print Verbosity",0); computeObj_ = Glist.get("Recompute Objective Function",false); @@ -237,9 +238,9 @@ class 
LineSearchStep : public Step { d_ = x.clone(); // Initialize unglobalized step - ROL::ParameterList& list - = parlist_.sublist("Step").sublist("Line Search").sublist("Descent Method"); - EDescent edesc = StringToEDescent(list.get("Type","Quasi-Newton Method")); + ROL::ParameterList& list = parlist_.sublist("Step").sublist("Line Search").sublist("Descent Method"); + std::string descentName = list.get("Type","Quasi-Newton Method"); + EDescent edesc = StringToEDescent(descentName); if (bnd.isActivated()) { switch(edesc) { case DESCENT_STEEPEST: { diff --git a/packages/rol/src/step/ROL_PrimalDualActiveSetStep.hpp b/packages/rol/src/step/ROL_PrimalDualActiveSetStep.hpp index 8cbf2daf433b..d4d61b43af88 100644 --- a/packages/rol/src/step/ROL_PrimalDualActiveSetStep.hpp +++ b/packages/rol/src/step/ROL_PrimalDualActiveSetStep.hpp @@ -295,7 +295,8 @@ class PrimalDualActiveSetStep : public Step { gtol_ = parlist.sublist("Step").sublist("Primal Dual Active Set").get("Relative Gradient Tolerance",oem6); scale_ = parlist.sublist("Step").sublist("Primal Dual Active Set").get("Dual Scaling", one); // Build secant object - esec_ = StringToESecant(parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); useSecantHessVec_ = parlist.sublist("General").sublist("Secant").get("Use as Hessian", false); useSecantPrecond_ = parlist.sublist("General").sublist("Secant").get("Use as Preconditioner", false); if ( useSecantHessVec_ || useSecantPrecond_ ) { diff --git a/packages/rol/src/step/ROL_ProjectedSecantStep.hpp b/packages/rol/src/step/ROL_ProjectedSecantStep.hpp index 8d9a560ae6c0..f8e1b0ee1b84 100644 --- a/packages/rol/src/step/ROL_ProjectedSecantStep.hpp +++ b/packages/rol/src/step/ROL_ProjectedSecantStep.hpp @@ -93,7 +93,8 @@ class ProjectedSecantStep : public Step { verbosity_ = parlist.sublist("General").get("Print 
Verbosity",0); // Initialize secant object if ( secant == ROL::nullPtr ) { - esec_ = StringToESecant(parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantType = parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantType); secant_ = SecantFactory(parlist); } } diff --git a/packages/rol/src/step/ROL_TrustRegionStep.hpp b/packages/rol/src/step/ROL_TrustRegionStep.hpp index 1fa681f43460..2c96ad74fa70 100644 --- a/packages/rol/src/step/ROL_TrustRegionStep.hpp +++ b/packages/rol/src/step/ROL_TrustRegionStep.hpp @@ -190,16 +190,21 @@ class TrustRegionStep : public Step { // Inexactness Information ROL::ParameterList &glist = parlist.sublist("General"); useInexact_.clear(); - useInexact_.push_back(glist.get("Inexact Objective Function", false)); - useInexact_.push_back(glist.get("Inexact Gradient", false)); - useInexact_.push_back(glist.get("Inexact Hessian-Times-A-Vector", false)); + bool inexactObj = glist.get("Inexact Objective Function", false); + bool inexactGrad = glist.get("Inexact Gradient", false); + bool inexactHessVec = glist.get("Inexact Hessian-Times-A-Vector", false); + useInexact_.push_back(inexactObj ); + useInexact_.push_back(inexactGrad ); + useInexact_.push_back(inexactHessVec); // Trust-Region Inexactness Parameters ROL::ParameterList &ilist = list.sublist("Inexact").sublist("Gradient"); scale0_ = ilist.get("Tolerance Scaling", static_cast(0.1)); scale1_ = ilist.get("Relative Tolerance", static_cast(2)); // Initialize Trust Region Subproblem Solver Object - etr_ = StringToETrustRegion(list.get("Subproblem Solver", "Dogleg")); - TRmodel_ = StringToETrustRegionModel(list.get("Subproblem Model", "Kelley-Sachs")); + std::string solverName = list.get("Subproblem Solver", "Dogleg"); + etr_ = StringToETrustRegion(solverName); + std::string modelName = list.get("Subproblem Model", "Kelley-Sachs"); + TRmodel_ = StringToETrustRegionModel(modelName); 
useProjectedGrad_ = glist.get("Projected Gradient Criticality Measure", false); trustRegion_ = TrustRegionFactory(parlist); // Scale for epsilon active sets @@ -328,7 +333,8 @@ class TrustRegionStep : public Step { parseParameterList(parlist); // Create secant object ROL::ParameterList &glist = parlist.sublist("General"); - esec_ = StringToESecant(glist.sublist("Secant").get("Type","Limited-Memory BFGS")); + std::string secantName = glist.sublist("Secant").get("Type","Limited-Memory BFGS"); + esec_ = StringToESecant(secantName); useSecantPrecond_ = glist.sublist("Secant").get("Use as Preconditioner", false); useSecantHessVec_ = glist.sublist("Secant").get("Use as Hessian", false); secant_ = SecantFactory(parlist); diff --git a/packages/rol/src/step/linesearch/ROL_Brents.hpp b/packages/rol/src/step/linesearch/ROL_Brents.hpp index 468d2d03445b..c77f58d5fc71 100644 --- a/packages/rol/src/step/linesearch/ROL_Brents.hpp +++ b/packages/rol/src/step/linesearch/ROL_Brents.hpp @@ -70,8 +70,7 @@ class Brents : public LineSearch { // Constructor Brents( ROL::ParameterList &parlist ) : LineSearch(parlist) { Real oem10(1.e-10); - ROL::ParameterList &list - = parlist.sublist("Step").sublist("Line Search").sublist("Line-Search Method").sublist("Brent's"); + ROL::ParameterList &list = parlist.sublist("Step").sublist("Line Search").sublist("Line-Search Method").sublist("Brent's"); tol_ = list.get("Tolerance",oem10); niter_ = list.get("Iteration Limit",1000); test_ = list.get("Run Test Upon Initialization",true); diff --git a/packages/rol/src/step/linesearch/ROL_LineSearch.hpp b/packages/rol/src/step/linesearch/ROL_LineSearch.hpp index 52d391b48dbf..b5fbb408efef 100644 --- a/packages/rol/src/step/linesearch/ROL_LineSearch.hpp +++ b/packages/rol/src/step/linesearch/ROL_LineSearch.hpp @@ -94,8 +94,11 @@ class LineSearch { LineSearch( ROL::ParameterList &parlist ) : eps_(0) { Real one(1), p9(0.9), p6(0.6), p4(0.4), oem4(1.e-4), zero(0); // Enumerations - edesc_ = 
StringToEDescent(parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Quasi-Newton Method")); - econd_ = StringToECurvatureCondition(parlist.sublist("Step").sublist("Line Search").sublist("Curvature Condition").get("Type","Strong Wolfe Conditions")); + std::string descentName = parlist.sublist("Step").sublist("Line Search").sublist("Descent Method").get("Type","Quasi-Newton Method"); + edesc_ = StringToEDescent(descentName); + + std::string condName = parlist.sublist("Step").sublist("Line Search").sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureCondition(condName); // Linesearch Parameters alpha0_ = parlist.sublist("Step").sublist("Line Search").get("Initial Step Size",one); alpha0bnd_ = parlist.sublist("Step").sublist("Line Search").get("Lower Bound for Initial Step Size",one); diff --git a/packages/rol/src/step/linesearch/ROL_ScalarMinimizationLineSearch.hpp b/packages/rol/src/step/linesearch/ROL_ScalarMinimizationLineSearch.hpp index f6ca951e29a7..112c7b51c974 100644 --- a/packages/rol/src/step/linesearch/ROL_ScalarMinimizationLineSearch.hpp +++ b/packages/rol/src/step/linesearch/ROL_ScalarMinimizationLineSearch.hpp @@ -218,9 +218,10 @@ class ScalarMinimizationLineSearch : public LineSearch { sf_ = sf; - + // Status test for line search - econd_ = StringToECurvatureCondition(list0.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions")); + std::string condName = list0.sublist("Curvature Condition").get("Type","Strong Wolfe Conditions"); + econd_ = StringToECurvatureCondition(condName); max_nfval_ = list0.get("Function Evaluation Limit",20); c1_ = list0.get("Sufficient Decrease Tolerance",oem4); c2_ = list0.sublist("Curvature Condition").get("General Parameter",p9); @@ -233,7 +234,8 @@ class ScalarMinimizationLineSearch : public LineSearch { c1_ = oem4; c2_ = p9; } - EDescent edesc = StringToEDescent(list0.sublist("Descent Method").get("Type","Quasi-Newton Method")); + 
std::string descentName = list0.sublist("Descent Method").get("Type","Quasi-Newton Method"); + EDescent edesc = StringToEDescent(descentName); if ( edesc == DESCENT_NONLINEARCG ) { c2_ = p4; c3_ = std::min(one-c2_,c3_); diff --git a/packages/rol/src/step/secant/ROL_SecantFactory.hpp b/packages/rol/src/step/secant/ROL_SecantFactory.hpp index f63d7b0df84c..843a19b869bc 100644 --- a/packages/rol/src/step/secant/ROL_SecantFactory.hpp +++ b/packages/rol/src/step/secant/ROL_SecantFactory.hpp @@ -69,8 +69,8 @@ namespace ROL { template inline ROL::Ptr > SecantFactory( ROL::ParameterList &parlist, ESecantMode mode = SECANTMODE_BOTH ) { - ESecant esec = StringToESecant( - parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS") ); + std::string secantName = parlist.sublist("General").sublist("Secant").get("Type","Limited-Memory BFGS"); + ESecant esec = StringToESecant(secantName); int L = parlist.sublist("General").sublist("Secant").get("Maximum Storage",10); int BB = parlist.sublist("General").sublist("Secant").get("Barzilai-Borwein",1); bool uds = parlist.sublist("General").sublist("Secant").get("Use Default Scaling",true); diff --git a/packages/rol/src/step/trustregion/ROL_TrustRegion.hpp b/packages/rol/src/step/trustregion/ROL_TrustRegion.hpp index 825ed7065761..dbaf9cc6e84c 100644 --- a/packages/rol/src/step/trustregion/ROL_TrustRegion.hpp +++ b/packages/rol/src/step/trustregion/ROL_TrustRegion.hpp @@ -94,7 +94,8 @@ class TrustRegion { : pRed_(0), ftol_old_(ROL_OVERFLOW()), cnt_(0), verbosity_(0) { // Trust-Region Parameters ROL::ParameterList list = parlist.sublist("Step").sublist("Trust Region"); - TRmodel_ = StringToETrustRegionModel(list.get("Subproblem Model", "Kelley-Sachs")); + std::string modelName = list.get("Subproblem Model", "Kelley-Sachs"); + TRmodel_ = StringToETrustRegionModel(modelName); eta0_ = list.get("Step Acceptance Threshold", static_cast(0.05)); eta1_ = list.get("Radius Shrinking Threshold", static_cast(0.05)); eta2_ = 
list.get("Radius Growing Threshold", static_cast(0.9)); @@ -107,9 +108,13 @@ class TrustRegion { // General Inexactness Information ROL::ParameterList &glist = parlist.sublist("General"); useInexact_.clear(); - useInexact_.push_back(glist.get("Inexact Objective Function", false)); - useInexact_.push_back(glist.get("Inexact Gradient", false)); - useInexact_.push_back(glist.get("Inexact Hessian-Times-A-Vector", false)); + + bool inexactObj = glist.get("Inexact Objective Function", false); + bool inexactGrad = glist.get("Inexact Gradient", false); + bool inexactHessVec = glist.get("Inexact Hessian-Times-A-Vector", false); + useInexact_.push_back(inexactObj ); + useInexact_.push_back(inexactGrad ); + useInexact_.push_back(inexactHessVec); // Inexact Function Evaluation Information ROL::ParameterList &ilist = list.sublist("Inexact").sublist("Value"); scale_ = ilist.get("Tolerance Scaling", static_cast(1.e-1)); From 1ad4d17327c792c56d225b1470b9b0028158cf99 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 16 Oct 2024 14:35:37 -0600 Subject: [PATCH 060/243] Removed all instances of 'using namespace std' from ROL Signed-off-by: Greg von Winckel --- .../example/PinT/tanks/LowerBandedMatrix.hpp | 3 +- packages/rol/example/PinT/tanks/TankState.hpp | 2 +- .../rol/example/PinT/tanks/TankState_Impl.hpp | 3 +- .../rol/example/PinT/tanks/TankVector.hpp | 6 +- .../example/PinT/tanks/TankVector_Impl.hpp | 2 +- .../PinT/tanks/Tanks_ControlVector.hpp | 6 +- .../PinT/tanks/Tanks_ControlVector_Impl.hpp | 2 - .../PinT/tanks/Tanks_DynamicConstraint.hpp | 8 +- .../example/PinT/tanks/Tanks_StateVector.hpp | 6 +- .../PinT/tanks/Tanks_StateVector_Impl.hpp | 2 - .../compatibility/backward_cpp/backward.hpp | 20 ++- .../boost/property_tree/ROL_ParameterList.hpp | 2 - .../ROL_DynamicConstraint_CheckInterface.hpp | 129 +++++++++--------- .../ROL_DynamicObjective_CheckInterface.hpp | 55 ++++---- packages/rol/src/step/krylov/ROL_MINRES.hpp | 22 ++- .../ROL_Constraint_CheckInterface.hpp | 11 
+- .../ROL_FiniteDifference.hpp | 2 - .../ROL_FiniteDifferenceDef.hpp | 3 - .../ROL_FunctionBindings.hpp | 15 +- .../ROL_Objective_CheckInterface.hpp | 9 +- ...ROL_Objective_SimOpt_CheckInterfaceDef.hpp | 11 +- .../ROL_ValidateFunction.hpp | 14 ++ .../ROL_ValidateFunctionDef.hpp | 1 - packages/rol/src/vector/ROL_VectorClone.hpp | 12 +- .../rol/src/vector/ROL_VectorWorkspace.hpp | 71 +++++----- packages/rol/src/zoo/ROL_Stream.hpp | 18 ++- 26 files changed, 216 insertions(+), 219 deletions(-) diff --git a/packages/rol/example/PinT/tanks/LowerBandedMatrix.hpp b/packages/rol/example/PinT/tanks/LowerBandedMatrix.hpp index e9b11e952377..867259213c0e 100644 --- a/packages/rol/example/PinT/tanks/LowerBandedMatrix.hpp +++ b/packages/rol/example/PinT/tanks/LowerBandedMatrix.hpp @@ -51,7 +51,8 @@ #include "ROL_StdVector.hpp" namespace details { -using namespace std; + +using std::vector; /* \class LowerBandedMatrix \brief Implements linear solve and multiplication by diff --git a/packages/rol/example/PinT/tanks/TankState.hpp b/packages/rol/example/PinT/tanks/TankState.hpp index e1509a8394e4..49abe48a2641 100644 --- a/packages/rol/example/PinT/tanks/TankState.hpp +++ b/packages/rol/example/PinT/tanks/TankState.hpp @@ -52,7 +52,7 @@ namespace details { -using namespace std; +using std::vector; template class TankState { diff --git a/packages/rol/example/PinT/tanks/TankState_Impl.hpp b/packages/rol/example/PinT/tanks/TankState_Impl.hpp index 510645b59f97..4532fe91caaa 100644 --- a/packages/rol/example/PinT/tanks/TankState_Impl.hpp +++ b/packages/rol/example/PinT/tanks/TankState_Impl.hpp @@ -47,7 +47,8 @@ namespace details { -using namespace std; +using std::vector; +using std::ostream; template TankState::TankState( ROL::ParameterList& pl ) : diff --git a/packages/rol/example/PinT/tanks/TankVector.hpp b/packages/rol/example/PinT/tanks/TankVector.hpp index c5ea81e15ecd..245e5814aec9 100644 --- a/packages/rol/example/PinT/tanks/TankVector.hpp +++ 
b/packages/rol/example/PinT/tanks/TankVector.hpp @@ -51,7 +51,11 @@ #include namespace details { -using namespace std; +using std::vector; +using std::string; +using std::ostream; +using std::endl; +using std::setw; template class TankControlVector; diff --git a/packages/rol/example/PinT/tanks/TankVector_Impl.hpp b/packages/rol/example/PinT/tanks/TankVector_Impl.hpp index 258b065ac185..d27b5ccfffb5 100644 --- a/packages/rol/example/PinT/tanks/TankVector_Impl.hpp +++ b/packages/rol/example/PinT/tanks/TankVector_Impl.hpp @@ -46,7 +46,7 @@ namespace details { -using namespace std; +using std::vector; using size_type = typename vector::size_type; diff --git a/packages/rol/example/PinT/tanks/Tanks_ControlVector.hpp b/packages/rol/example/PinT/tanks/Tanks_ControlVector.hpp index 22fcf7a54fb7..d937e1dd5995 100644 --- a/packages/rol/example/PinT/tanks/Tanks_ControlVector.hpp +++ b/packages/rol/example/PinT/tanks/Tanks_ControlVector.hpp @@ -52,7 +52,11 @@ namespace Tanks { -using namespace std; +using std::vector; +using std::ostream; +using std::string; +using std::setw; +using std::endl; // Forward declaration template class StateVector; diff --git a/packages/rol/example/PinT/tanks/Tanks_ControlVector_Impl.hpp b/packages/rol/example/PinT/tanks/Tanks_ControlVector_Impl.hpp index 1947cc7e3068..8bf41c515cbc 100644 --- a/packages/rol/example/PinT/tanks/Tanks_ControlVector_Impl.hpp +++ b/packages/rol/example/PinT/tanks/Tanks_ControlVector_Impl.hpp @@ -46,8 +46,6 @@ namespace Tanks { -using namespace std; - using size_type = typename vector::size_type; diff --git a/packages/rol/example/PinT/tanks/Tanks_DynamicConstraint.hpp b/packages/rol/example/PinT/tanks/Tanks_DynamicConstraint.hpp index 7aa99f8b674d..27e41f2a6825 100644 --- a/packages/rol/example/PinT/tanks/Tanks_DynamicConstraint.hpp +++ b/packages/rol/example/PinT/tanks/Tanks_DynamicConstraint.hpp @@ -103,7 +103,7 @@ class DynamicConstraint : public ROL::DynamicConstraint { //--------- Subvector addressing 
--------------------------------------------- size_type h_, Qout_, Qin_, z_; - shared_ptr L_, R_, S_; + std::shared_ptr L_, R_, S_; State zero_state_; Control zero_ctrl_; @@ -148,9 +148,9 @@ class DynamicConstraint : public ROL::DynamicConstraint { p_( ptrows.at(i), ptcols.at(i) ) = 0.0; } - L_ = make_shared>( rows_, cols_, alphaL_, *(p_.getVector()) ); - R_ = make_shared>( rows_, cols_, -alphaR_, *(p_.getVector()) ); - S_ = make_shared>( rows_, cols_ ); + L_ = std::make_shared>( rows_, cols_, alphaL_, *(p_.getVector()) ); + R_ = std::make_shared>( rows_, cols_, -alphaR_, *(p_.getVector()) ); + S_ = std::make_shared>( rows_, cols_ ); } static ROL::Ptr create( ROL::ParameterList& pl ) { diff --git a/packages/rol/example/PinT/tanks/Tanks_StateVector.hpp b/packages/rol/example/PinT/tanks/Tanks_StateVector.hpp index 7a5eaf5a3b3e..d8551aa77fcf 100644 --- a/packages/rol/example/PinT/tanks/Tanks_StateVector.hpp +++ b/packages/rol/example/PinT/tanks/Tanks_StateVector.hpp @@ -52,7 +52,11 @@ namespace Tanks { -using namespace std; +using std::vector; +using std::ostream; +using std::string; +using std::setw; +using std::endl; // Forward declaration template class ControlVector; diff --git a/packages/rol/example/PinT/tanks/Tanks_StateVector_Impl.hpp b/packages/rol/example/PinT/tanks/Tanks_StateVector_Impl.hpp index bd12f5ee09cd..3cbd4b6c86e8 100644 --- a/packages/rol/example/PinT/tanks/Tanks_StateVector_Impl.hpp +++ b/packages/rol/example/PinT/tanks/Tanks_StateVector_Impl.hpp @@ -46,8 +46,6 @@ namespace Tanks { -using namespace std; - using size_type = typename vector::size_type; template diff --git a/packages/rol/src/compatibility/backward_cpp/backward.hpp b/packages/rol/src/compatibility/backward_cpp/backward.hpp index 237eb3253e81..4656af3a1b24 100644 --- a/packages/rol/src/compatibility/backward_cpp/backward.hpp +++ b/packages/rol/src/compatibility/backward_cpp/backward.hpp @@ -1184,9 +1184,9 @@ class TraceResolverLinuxImpl: return r; // damned, that's a stripped file 
that you got there! } - r.handle = move(bfd_handle); - r.symtab = move(symtab); - r.dynamic_symtab = move(dynamic_symtab); + r.handle = std::move(bfd_handle); + r.symtab = std::move(symtab); + r.dynamic_symtab = std::move(dynamic_symtab); return r; } @@ -2045,8 +2045,8 @@ class TraceResolverLinuxImpl: // If we have a valid elf handle, return the new elf handle // and file handle and discard the original ones if (debuglink_elf) { - elf_handle = move(debuglink_elf); - file_handle = move(debuglink_file); + elf_handle = std::move(debuglink_elf); + file_handle = std::move(debuglink_file); } } } @@ -2068,9 +2068,9 @@ class TraceResolverLinuxImpl: dwarf_handle.reset(dwarf_debug); - r.file_handle = move(file_handle); - r.elf_handle = move(elf_handle); - r.dwarf_handle = move(dwarf_handle); + r.file_handle = std::move(file_handle); + r.elf_handle = std::move(elf_handle); + r.dwarf_handle = std::move(dwarf_handle); return r; } @@ -3180,7 +3180,6 @@ class SourceFile { bool is_open() const { return _file->is_open(); } lines_t& get_lines(unsigned line_start, unsigned line_count, lines_t& lines) { - using namespace std; // This function make uses of the dumbest algo ever: // 1) seek(0) // 2) read lines one by one and discard until line_start @@ -3192,7 +3191,7 @@ class SourceFile { _file->clear(); _file->seekg(0); - string line; + std::string line; unsigned line_idx; for (line_idx = 1; line_idx < line_start; ++line_idx) { @@ -3590,7 +3589,6 @@ class Printer { Colorize& colorize, Color::type color_code, int context_size) { - using namespace std; typedef SnippetFactory::lines_t lines_t; lines_t lines = _snippets.get_snippet(source_loc.filename, diff --git a/packages/rol/src/compatibility/boost/property_tree/ROL_ParameterList.hpp b/packages/rol/src/compatibility/boost/property_tree/ROL_ParameterList.hpp index 11d52296aa01..cae86a2b8a85 100644 --- a/packages/rol/src/compatibility/boost/property_tree/ROL_ParameterList.hpp +++ 
b/packages/rol/src/compatibility/boost/property_tree/ROL_ParameterList.hpp @@ -68,8 +68,6 @@ namespace ROL { namespace details { -using namespace std; - // Try to get type of an object // FIXME: sometimes failing for std::string template struct value_type diff --git a/packages/rol/src/function/dynamic/ROL_DynamicConstraint_CheckInterface.hpp b/packages/rol/src/function/dynamic/ROL_DynamicConstraint_CheckInterface.hpp index b1b2ec2e6b11..7d585cb6772c 100644 --- a/packages/rol/src/function/dynamic/ROL_DynamicConstraint_CheckInterface.hpp +++ b/packages/rol/src/function/dynamic/ROL_DynamicConstraint_CheckInterface.hpp @@ -52,9 +52,6 @@ namespace ROL { namespace details { -using namespace std; -namespace ph = std::placeholders; - template class DynamicConstraint_CheckInterface { private: @@ -79,179 +76,179 @@ class DynamicConstraint_CheckInterface { } f_update_t update_uo( const V& un, const V& z ) { - return bind( &Con::update, &con_, ph::_1, cref(un), cref(z), ts_ ); + return std::bind( &Con::update, &con_, ph::_1, std::cref(un), std::cref(z), ts_ ); } f_update_t update_un( const V& uo, const V& z ) { - return bind( &Con::update, &con_, cref(uo), ph::_1, cref(z), ts_ ); + return std::bind( &Con::update, &con_, std::cref(uo), ph::_1, std::cref(z), ts_ ); } f_update_t update_z( const V& uo, const V& un ) { - return bind( &Con::update, &con_, cref(uo), cref(un), ph::_1, ts_ ); + return std::bind( &Con::update, &con_, std::cref(uo), std::cref(un), ph::_1, ts_ ); } //---------------------------------------------------------------------------- f_vector_t value_uo( const V& un, const V& z ) { - return bind( &Con::value, &con_, - ph::_1, ph::_2, cref(un), cref(z), ts_ ); + return std::bind( &Con::value, &con_, + ph::_1, ph::_2, std::cref(un), std::cref(z), ts_ ); } f_vector_t value_un( const V& uo, const V& z ) { - return bind( &Con::value, &con_, - ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Con::value, &con_, + ph::_1, std::cref(uo), ph::_2, 
std::cref(z), ts_ ); } f_vector_t value_z( const V& uo, const V& un ) { - return bind( &Con::value, &con_, - ph::_1, cref(uo), cref(un), ph::_2, ts_ ); + return std::bind( &Con::value, &con_, + ph::_1, std::cref(uo), std::cref(un), ph::_2, ts_ ); } f_solve_t solve_un( const V& uo, const V& z ) { - return bind( &Con::solve, &con_, - ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Con::solve, &con_, + ph::_1, std::cref(uo), ph::_2, std::cref(z), ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t jacobian_uo( const V& un, const V& z ) { - return bind( &Con::applyJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, - cref(un), cref(z), ts_ ); + return std::bind( &Con::applyJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, + std::cref(un), std::cref(z), ts_ ); } f_dderiv_t jacobian_un( const V& uo, const V& z ) { - return bind( &Con::applyJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t inverseJacobian_un( const V& uo, const V& z ) { - return bind( &Con::applyInverseJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyInverseJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t jacobian_z( const V& uo, const V& un ) { - return bind( &Con::applyJacobian_z, &con_, ph::_1, ph::_2, cref(uo), - cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyJacobian_z, &con_, ph::_1, ph::_2, std::cref(uo), + std::cref(un), ph::_3, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t adjointJacobian_uo( const V& un, const V& z ) { - return bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, - cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, + std::cref(un), std::cref(z), ts_ ); } 
f_dderiv_t adjointJacobian_un( const V& uo, const V& z ) { - return bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t inverseAdjointJacobian_un( const V& uo, const V& z ) { - return bind( &Con::applyInverseAdjointJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyInverseAdjointJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_z( const V& uo, const V& un ) { - return bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, cref(uo), - cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, std::cref(uo), + std::cref(un), ph::_3, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t adjointJacobian_uo_uo( const V& un, const V& z ) { - return bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, - cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, ph::_3, + std::cref(un), std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_uo_un( const V& uo, const V& z ) { - return bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_uo_z( const V& uo, const V& un ) { - return bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, cref(uo), - cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointJacobian_uo, &con_, ph::_1, ph::_2, std::cref(uo), + std::cref(un), ph::_3, ts_ ); } f_dderiv_t adjointJacobian_un_uo( const V& un, const V& z ) { - return bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, ph::_3, - cref(un), cref(z), ts_ ); + 
return std::bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, ph::_3, + std::cref(un), std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_un_un( const V& uo, const V& z ) { - return bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_un_z( const V& uo, const V& un ) { - return bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, cref(uo), - cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointJacobian_un, &con_, ph::_1, ph::_2, std::cref(uo), + std::cref(un), ph::_3, ts_ ); } f_dderiv_t adjointJacobian_z_uo( const V& un, const V& z ) { - return bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, ph::_3, - cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, ph::_3, + std::cref(un), std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_z_un( const V& uo, const V& z ) { - return bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, cref(uo), - ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, std::cref(uo), + ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointJacobian_z_z( const V& uo, const V& un ) { - return bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, cref(uo), - cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointJacobian_z, &con_, ph::_1, ph::_2, std::cref(uo), + std::cref(un), ph::_3, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t adjointHessian_un_un( const V& uo, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_un_un, &con_, ph::_1, cref(l), ph::_2, - cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_un_un, &con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointHessian_un_uo( 
const V& uo, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_un_uo, &con_, ph::_1, cref(l), ph::_2, - cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_un_uo, &con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), ph::_3, std::cref(z), ts_ ); } f_dderiv_t adjointHessian_un_z( const V& uo, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_un_z, &con_, ph::_1, cref(l), ph::_2, - cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_un_z, &con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), ph::_3, std::cref(z), ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t adjointHessian_uo_un( const V& un, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_uo_un, &con_, ph::_1, cref(l), ph::_2, - ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_uo_un, &con_, ph::_1, std::cref(l), ph::_2, + ph::_3, std::cref(un), std::cref(z), ts_ ); } f_dderiv_t adjointHessian_uo_uo( const V& un, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_uo_uo, &con_, ph::_1, cref(l), ph::_2, - ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_uo_uo, &con_, ph::_1, std::cref(l), ph::_2, + ph::_3, std::cref(un), std::cref(z), ts_ ); } f_dderiv_t adjointHessian_uo_z( const V& un, const V& z, const V& l ) { - return bind( &Con::applyAdjointHessian_uo_z, &con_, ph::_1, cref(l), ph::_2, - ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Con::applyAdjointHessian_uo_z, &con_, ph::_1, std::cref(l), ph::_2, + ph::_3, std::cref(un), std::cref(z), ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t adjointHessian_z_un( const V& uo, const V& un, const V& l ) { - return bind( &Con::applyAdjointHessian_z_un, &con_, ph::_1, cref(l), ph::_2, - cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointHessian_z_un, 
&con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), std::cref(un), ph::_3, ts_ ); } f_dderiv_t adjointHessian_z_uo( const V& uo, const V& un, const V& l ) { - return bind( &Con::applyAdjointHessian_z_uo, &con_, ph::_1, cref(l), ph::_2, - cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointHessian_z_uo, &con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), std::cref(un), ph::_3, ts_ ); } f_dderiv_t adjointHessian_z_z( const V& uo, const V& un, const V& l ) { - return bind( &Con::applyAdjointHessian_z_z, &con_, ph::_1, cref(l), ph::_2, - cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Con::applyAdjointHessian_z_z, &con_, ph::_1, std::cref(l), ph::_2, + std::cref(uo), std::cref(un), ph::_3, ts_ ); } }; // class DynamicConstraint_CheckInterface diff --git a/packages/rol/src/function/dynamic/ROL_DynamicObjective_CheckInterface.hpp b/packages/rol/src/function/dynamic/ROL_DynamicObjective_CheckInterface.hpp index 85343f20b423..a72fd0ab4cff 100644 --- a/packages/rol/src/function/dynamic/ROL_DynamicObjective_CheckInterface.hpp +++ b/packages/rol/src/function/dynamic/ROL_DynamicObjective_CheckInterface.hpp @@ -55,7 +55,6 @@ namespace ROL { namespace details { -using namespace std; namespace ph = std::placeholders; template @@ -83,122 +82,122 @@ class DynamicObjective_CheckInterface { f_update_t update_uo( const V& un, const V& z ) { - return bind( &Obj::update, &obj_, ph::_1, cref(un), cref(z), ts_ ); + return std::bind( &Obj::update, &obj_, ph::_1, std::cref(un), std::cref(z), ts_ ); } f_update_t update_un( const V& uo, const V& z ) { - return bind( &Obj::update, &obj_, cref(uo), ph::_1, cref(z), ts_ ); + return std::bind( &Obj::update, &obj_, std::cref(uo), ph::_1, std::cref(z), ts_ ); } f_update_t update_z( const V& uo, const V& un ) { - return bind( &Obj::update, &obj_, cref(uo), cref(un), ph::_1, ts_ ); + return std::bind( &Obj::update, &obj_, std::cref(uo), std::cref(un), ph::_1, ts_ ); } 
//---------------------------------------------------------------------------- f_scalar_t value_uo( const V& un, const V& z ) { - return bind( &Obj::value, &obj_, ph::_1, cref(un), cref(z), ts_ ); + return std::bind( &Obj::value, &obj_, ph::_1, std::cref(un), std::cref(z), ts_ ); } f_scalar_t value_un( const V& uo, const V& z ) { - return bind( &Obj::value, &obj_, cref(uo), ph::_1, cref(z), ts_ ); + return std::bind( &Obj::value, &obj_, std::cref(uo), ph::_1, std::cref(z), ts_ ); } f_scalar_t value_z( const V& uo, const V& un ) { - return bind( &Obj::value, &obj_, cref(uo), cref(un), ph::_1, ts_ ); + return std::bind( &Obj::value, &obj_, std::cref(uo), std::cref(un), ph::_1, ts_ ); } //---------------------------------------------------------------------------- f_vector_t gradient_uo( const V& un, const V& z ) { - return bind( &Obj::gradient_uo, &obj_, ph::_1, ph::_2, cref(un), cref(z), ts_ ); + return std::bind( &Obj::gradient_uo, &obj_, ph::_1, ph::_2, std::cref(un), std::cref(z), ts_ ); } f_vector_t gradient_un( const V& uo, const V& z ) { - return bind( &Obj::gradient_un, &obj_, ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Obj::gradient_un, &obj_, ph::_1, std::cref(uo), ph::_2, std::cref(z), ts_ ); } f_vector_t gradient_z( const V& uo, const V& un ) { - return bind( &Obj::gradient_z, &obj_, ph::_1, cref(uo), cref(un), ph::_2, ts_ ); + return std::bind( &Obj::gradient_z, &obj_, ph::_1, std::cref(uo), std::cref(un), ph::_2, ts_ ); } // For hessian checks f_vector_t gradient_uo_uo( const V& un, const V& z ) { - return bind( &Obj::gradient_uo, &obj_, ph::_1, ph::_2, cref(un), cref(z), ts_ ); + return std::bind( &Obj::gradient_uo, &obj_, ph::_1, ph::_2, std::cref(un), std::cref(z), ts_ ); } f_vector_t gradient_uo_un( const V& uo, const V& z ) { - return bind( &Obj::gradient_uo, &obj_, ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Obj::gradient_uo, &obj_, ph::_1, std::cref(uo), ph::_2, std::cref(z), ts_ ); } f_vector_t 
gradient_uo_z( const V& uo, const V& un ) { - return bind( &Obj::gradient_uo, &obj_, ph::_1, cref(uo), cref(un), ph::_2, ts_ ); + return std::bind( &Obj::gradient_uo, &obj_, ph::_1, std::cref(uo), std::cref(un), ph::_2, ts_ ); } f_vector_t gradient_un_uo( const V& un, const V& z ) { - return bind( &Obj::gradient_un, &obj_, ph::_1, ph::_2, cref(un), cref(z), ts_ ); + return std::bind( &Obj::gradient_un, &obj_, ph::_1, ph::_2, std::cref(un), std::cref(z), ts_ ); } f_vector_t gradient_un_un( const V& uo, const V& z ) { - return bind( &Obj::gradient_un, &obj_, ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Obj::gradient_un, &obj_, ph::_1, std::cref(uo), ph::_2, std::cref(z), ts_ ); } f_vector_t gradient_un_z( const V& uo, const V& un ) { - return bind( &Obj::gradient_un, &obj_, ph::_1, cref(uo), cref(un), ph::_2, ts_ ); + return std::bind( &Obj::gradient_un, &obj_, ph::_1, std::cref(uo), std::cref(un), ph::_2, ts_ ); } f_vector_t gradient_z_uo( const V& un, const V& z ) { - return bind( &Obj::gradient_z, &obj_, ph::_1, ph::_2, cref(un), cref(z), ts_ ); + return std::bind( &Obj::gradient_z, &obj_, ph::_1, ph::_2, std::cref(un), std::cref(z), ts_ ); } f_vector_t gradient_z_un( const V& uo, const V& z ) { - return bind( &Obj::gradient_z, &obj_, ph::_1, cref(uo), ph::_2, cref(z), ts_ ); + return std::bind( &Obj::gradient_z, &obj_, ph::_1, std::cref(uo), ph::_2, std::cref(z), ts_ ); } f_vector_t gradient_z_z( const V& uo, const V& un ) { - return bind( &Obj::gradient_z, &obj_, ph::_1, cref(uo), cref(un), ph::_2, ts_ ); + return std::bind( &Obj::gradient_z, &obj_, ph::_1, std::cref(uo), std::cref(un), ph::_2, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t hessVec_uo_uo( const V& un, const V& z ) { - return bind( &Obj::hessVec_uo_uo, &obj_, ph::_1, ph::_2, ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Obj::hessVec_uo_uo, &obj_, ph::_1, ph::_2, ph::_3, std::cref(un), std::cref(z), ts_ ); } f_dderiv_t 
hessVec_uo_un( const V& uo, const V& z ) { - return bind( &Obj::hessVec_uo_un, &obj_, ph::_1, ph::_2, cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Obj::hessVec_uo_un, &obj_, ph::_1, ph::_2, std::cref(uo), ph::_3, std::cref(z), ts_ ); } f_dderiv_t hessVec_uo_z( const V& uo, const V& un ) { - return bind( &Obj::hessVec_uo_z, &obj_, ph::_1, ph::_2, cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Obj::hessVec_uo_z, &obj_, ph::_1, ph::_2, std::cref(uo), std::cref(un), ph::_3, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t hessVec_un_uo( const V& un, const V& z ) { - return bind( &Obj::hessVec_un_uo, &obj_, ph::_1, ph::_2, ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Obj::hessVec_un_uo, &obj_, ph::_1, ph::_2, ph::_3, std::cref(un), std::cref(z), ts_ ); } f_dderiv_t hessVec_un_un( const V& uo, const V& z ) { - return bind( &Obj::hessVec_un_un, &obj_, ph::_1, ph::_2, cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Obj::hessVec_un_un, &obj_, ph::_1, ph::_2, std::cref(uo), ph::_3, std::cref(z), ts_ ); } f_dderiv_t hessVec_un_z( const V& uo, const V& un ) { - return bind( &Obj::hessVec_un_z, &obj_, ph::_1, ph::_2, cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Obj::hessVec_un_z, &obj_, ph::_1, ph::_2, std::cref(uo), std::cref(un), ph::_3, ts_ ); } //---------------------------------------------------------------------------- f_dderiv_t hessVec_z_uo( const V& un, const V& z ) { - return bind( &Obj::hessVec_z_uo, &obj_, ph::_1, ph::_2, ph::_3, cref(un), cref(z), ts_ ); + return std::bind( &Obj::hessVec_z_uo, &obj_, ph::_1, ph::_2, ph::_3, std::cref(un), std::cref(z), ts_ ); } f_dderiv_t hessVec_z_un( const V& uo, const V& z ) { - return bind( &Obj::hessVec_z_un, &obj_, ph::_1, ph::_2, cref(uo), ph::_3, cref(z), ts_ ); + return std::bind( &Obj::hessVec_z_un, &obj_, ph::_1, ph::_2, std::cref(uo), ph::_3, std::cref(z), ts_ ); } f_dderiv_t hessVec_z_z( const V& uo, const V& un ) { - 
return bind( &Obj::hessVec_z_z, &obj_, ph::_1, ph::_2, cref(uo), cref(un), ph::_3, ts_ ); + return std::bind( &Obj::hessVec_z_z, &obj_, ph::_1, ph::_2, std::cref(uo), std::cref(un), ph::_3, ts_ ); } diff --git a/packages/rol/src/step/krylov/ROL_MINRES.hpp b/packages/rol/src/step/krylov/ROL_MINRES.hpp index 27aaabbc287c..d22323666b84 100644 --- a/packages/rol/src/step/krylov/ROL_MINRES.hpp +++ b/packages/rol/src/step/krylov/ROL_MINRES.hpp @@ -59,8 +59,6 @@ namespace ROL { namespace details { -using namespace std; - template class MINRES : public Krylov { @@ -73,8 +71,8 @@ class MINRES : public Krylov { Real resnorm_; int maxiter_; bool useInexact_; - array H_; - array rhs_; + std::array H_; + std::array rhs_; VectorCloneMap clones_; @@ -85,23 +83,23 @@ class MINRES : public Krylov { if( b == zero ) { c = ( a >= zero ? one : -one ); s = zero; - r = abs(a); + r = std::abs(a); } else if( a == zero ) { c = zero; s = ( b >= zero ? one : -one ); - r = abs(b); + r = std::abs(b); } - else if( abs(a) > abs(b) ) { + else if( std::abs(a) > std::abs(b) ) { auto t = b/a; - auto u = copysign(sqrt(one+t*t),a); + auto u = std::copysign(std::sqrt(one+t*t),a); c = one/u; s = c*t; r = a*u; } else { auto t = a/b; - auto u = copysign(sqrt(one+t*t),b); + auto u = std::copysign(std::sqrt(one+t*t),b); s = 1/u; c = s*t; r = b*u; @@ -127,8 +125,8 @@ class MINRES : public Krylov { Real c_prev{0}, s_prev{0}, c_curr{0}, s_curr{0}, c_next{0}, s_next{0}; resnorm_ = v_curr->norm(); - Real rtol = min(Krylov::getAbsoluteTolerance(),Krylov::getRelativeTolerance()*resnorm_); - Real itol = sqrt(ROL_EPSILON()); + Real rtol = std::min(Krylov::getAbsoluteTolerance(),Krylov::getRelativeTolerance()*resnorm_); + Real itol = std::sqrt(ROL_EPSILON()); for( auto &e: H_ ) e = 0; @@ -196,7 +194,7 @@ class MINRES : public Krylov { H_[1] = H_[3]; - resnorm_ = abs( rhs_[1] ); + resnorm_ = std::abs( rhs_[1] ); } // for (iter) diff --git a/packages/rol/src/utils/function_bindings/ROL_Constraint_CheckInterface.hpp 
b/packages/rol/src/utils/function_bindings/ROL_Constraint_CheckInterface.hpp index 5d116d3efd8c..78f700aff176 100644 --- a/packages/rol/src/utils/function_bindings/ROL_Constraint_CheckInterface.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_Constraint_CheckInterface.hpp @@ -53,7 +53,6 @@ namespace ROL { namespace details { -using namespace std; namespace ph = std::placeholders; template @@ -69,28 +68,28 @@ class Constraint_CheckInterface { con_(con), tol_(sqrt(ROL_EPSILON())) {} f_update_t update() { - return bind( (void(Constraint::*)(const Vector&,bool,int))&Constraint::update, &con_, ph::_1, true, 0 ); + return std::bind( (void(Constraint::*)(const Vector&,bool,int))&Constraint::update, &con_, ph::_1, true, 0 ); } f_vector_t value() { - return bind( &Constraint::value, &con_, ph::_1, ph::_2, tol_); + return std::bind( &Constraint::value, &con_, ph::_1, ph::_2, tol_); } f_dderiv_t jacobian() { - return bind( &Constraint::applyJacobian, &con_, ph::_1, ph::_2, ph::_3, tol_); + return std::bind( &Constraint::applyJacobian, &con_, ph::_1, ph::_2, ph::_3, tol_); } // Provide a vector in the dual constraint space f_dderiv_t adjointJacobian( ) { - return bind( static_cast::*) + return std::bind( static_cast::*) ( V&, const V&, const V&, Real& )> (&Constraint::applyAdjointJacobian), &con_, ph::_1, ph::_2, ph::_3, tol_); } f_dderiv_t adjointHessian( const V& l ) { - return bind( &Constraint::applyAdjointHessian, &con_, ph::_1, cref(l), ph::_2, ph::_3, tol_); + return std::bind( &Constraint::applyAdjointHessian, &con_, ph::_1, std::cref(l), ph::_2, ph::_3, tol_); } diff --git a/packages/rol/src/utils/function_bindings/ROL_FiniteDifference.hpp b/packages/rol/src/utils/function_bindings/ROL_FiniteDifference.hpp index 3c5fab8d1654..0683f9debbda 100644 --- a/packages/rol/src/utils/function_bindings/ROL_FiniteDifference.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_FiniteDifference.hpp @@ -63,8 +63,6 @@ namespace ROL { namespace details { -using namespace std; 
- template class FiniteDifference { public: diff --git a/packages/rol/src/utils/function_bindings/ROL_FiniteDifferenceDef.hpp b/packages/rol/src/utils/function_bindings/ROL_FiniteDifferenceDef.hpp index a75605a0b493..b796c7ba534b 100644 --- a/packages/rol/src/utils/function_bindings/ROL_FiniteDifferenceDef.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_FiniteDifferenceDef.hpp @@ -46,13 +46,10 @@ #ifndef ROL_FINITEDIFFERENCEDEF_HPP #define ROL_FINITEDIFFERENCEDEF_HPP -#include - namespace ROL { namespace details { -using namespace std; using ::ROL::Finite_Difference_Arrays::shifts; using ::ROL::Finite_Difference_Arrays::weights; diff --git a/packages/rol/src/utils/function_bindings/ROL_FunctionBindings.hpp b/packages/rol/src/utils/function_bindings/ROL_FunctionBindings.hpp index 5d5916c411db..1b7a99b3e38b 100644 --- a/packages/rol/src/utils/function_bindings/ROL_FunctionBindings.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_FunctionBindings.hpp @@ -53,32 +53,31 @@ namespace ROL { namespace details { -using namespace std; namespace ph = std::placeholders; template -using f_update_t = function& )>; +using f_update_t = std::function& )>; template -using f_scalar_t = function& )>; +using f_scalar_t = std::function& )>; template -using f_vector_t = function&, const Vector& )>; +using f_vector_t = std::function&, const Vector& )>; template -using f_dderiv_t = function&, const Vector&, const Vector& )>; +using f_dderiv_t = std::function&, const Vector&, const Vector& )>; template -using f_solve_t = function &, Vector & )>; +using f_solve_t = std::function &, Vector & )>; template inline f_vector_t fix_direction( f_dderiv_t& f, const Vector& v ) { - return bind( f, ph::_1, cref(v), ph::_2 ); + return std::bind( f, ph::_1, std::cref(v), ph::_2 ); } template inline f_vector_t fix_position( f_dderiv_t& f, const Vector& x ) { - return bind( f, ph::_1, ph::_2, cref(x) ); + return std::bind( f, ph::_1, ph::_2, std::cref(x) ); } } // namespace details diff --git 
a/packages/rol/src/utils/function_bindings/ROL_Objective_CheckInterface.hpp b/packages/rol/src/utils/function_bindings/ROL_Objective_CheckInterface.hpp index 53685cd649a9..eadf019f5572 100644 --- a/packages/rol/src/utils/function_bindings/ROL_Objective_CheckInterface.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_Objective_CheckInterface.hpp @@ -52,7 +52,6 @@ namespace ROL { namespace details { -using namespace std; namespace ph = std::placeholders; template @@ -68,19 +67,19 @@ class Objective_CheckInterface { obj_(obj), tol_(sqrt(ROL_EPSILON())) {} f_update_t update() { - return bind( (void(Objective::*)(const Vector&,bool,int))&Objective::update, &obj_, ph::_1, true, 0 ); + return std::bind( (void(Objective::*)(const Vector&,bool,int))&Objective::update, &obj_, ph::_1, true, 0 ); } f_scalar_t value() { - return bind( &Objective::value, &obj_, ph::_1, tol_); + return std::bind( &Objective::value, &obj_, ph::_1, tol_); } f_vector_t gradient() { - return bind( &Objective::gradient, &obj_, ph::_1, ph::_2, tol_); + return std::bind( &Objective::gradient, &obj_, ph::_1, ph::_2, tol_); } f_dderiv_t hessVec() { - return bind( &Objective::hessVec, &obj_, ph::_1, ph::_2, ph::_3, tol_); + return std::bind( &Objective::hessVec, &obj_, ph::_1, ph::_2, ph::_3, tol_); } }; // Objective_CheckInterface diff --git a/packages/rol/src/utils/function_bindings/ROL_Objective_SimOpt_CheckInterfaceDef.hpp b/packages/rol/src/utils/function_bindings/ROL_Objective_SimOpt_CheckInterfaceDef.hpp index 533e4b184a0b..1576edf9d45b 100644 --- a/packages/rol/src/utils/function_bindings/ROL_Objective_SimOpt_CheckInterfaceDef.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_Objective_SimOpt_CheckInterfaceDef.hpp @@ -53,7 +53,6 @@ namespace ROL { namespace details { -using namespace std; namespace ph = std::placeholders; template @@ -70,24 +69,24 @@ class Objective_SimOpt_CheckInterface { // Takes a Vector_SimOpt f_update_t update() { - return bind( &Objective_SimOpt::update, &obj_, 
ph::_1, true, 0 ); + return std::bind( &Objective_SimOpt::update, &obj_, ph::_1, true, 0 ); } // Takes a Vector_SimOpt f_scalar_t value() { - return bind( &Objective_SimOpt::value, &obj_, ph::_1, tol_); + return std::bind( &Objective_SimOpt::value, &obj_, ph::_1, tol_); } f_vector_t gradient_1( const V& z ) { - return bind( &Objective_SimOpt::gradient_1, &obj_, ph::_1, ph::_2, cref(z), tol_); + return std::bind( &Objective_SimOpt::gradient_1, &obj_, ph::_1, ph::_2, std::cref(z), tol_); } f_vector_t gradient_2( const V& u ) { - return bind( &Objective_SimOpt::gradient_2, &obj_, ph::_1, cref(u), ph::_2, tol_); + return std::bind( &Objective_SimOpt::gradient_2, &obj_, ph::_1, std::cref(u), ph::_2, tol_); } f_dderiv_t hessVec_11( const ) { - return bind( &Objective_SimOpt::hessVec, &obj_, ph::_1, ph::_2, ph::_3, tol_); + return std::bind( &Objective_SimOpt::hessVec, &obj_, ph::_1, ph::_2, ph::_3, tol_); } }; // Objective_CheckInterface diff --git a/packages/rol/src/utils/function_bindings/ROL_ValidateFunction.hpp b/packages/rol/src/utils/function_bindings/ROL_ValidateFunction.hpp index e8bfb6d672d0..868a9c4b8d2c 100644 --- a/packages/rol/src/utils/function_bindings/ROL_ValidateFunction.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_ValidateFunction.hpp @@ -61,6 +61,20 @@ namespace ROL { namespace details { + +using std::cout; +using std::ostream; +using std::string; +using std::vector; +using std::setw; +using std::setprecision; +using std::scientific; +using std::abs; +using std::right; +using std::max; +using std::endl; + + template class ValidateFunction { public: diff --git a/packages/rol/src/utils/function_bindings/ROL_ValidateFunctionDef.hpp b/packages/rol/src/utils/function_bindings/ROL_ValidateFunctionDef.hpp index e430360649f3..b84b6096d3b3 100644 --- a/packages/rol/src/utils/function_bindings/ROL_ValidateFunctionDef.hpp +++ b/packages/rol/src/utils/function_bindings/ROL_ValidateFunctionDef.hpp @@ -6,7 +6,6 @@ namespace ROL { namespace details { -using 
namespace std; template ValidateFunction::ValidateFunction( const int order, diff --git a/packages/rol/src/vector/ROL_VectorClone.hpp b/packages/rol/src/vector/ROL_VectorClone.hpp index 31928e07f3fc..dc34a5bec5c1 100644 --- a/packages/rol/src/vector/ROL_VectorClone.hpp +++ b/packages/rol/src/vector/ROL_VectorClone.hpp @@ -64,8 +64,6 @@ namespace ROL { namespace details { -using namespace std; - template class VectorClone { private: @@ -80,9 +78,9 @@ class VectorClone { Ptr> operator() ( const Vector& x ) { if( is_allocated_ ) { if( typeid(x) != typeid(*vec_) ) - throw logic_error("Argument and member vector types are different!"); + throw std::logic_error("Argument and member vector types are different!"); if( x.dimension() != vec_->dimension() ) - throw logic_error("Argument and member vector types have different dimensions!"); + throw std::logic_error("Argument and member vector types have different dimensions!"); } else { vec_ = x.clone(); @@ -94,9 +92,9 @@ class VectorClone { Ptr> operator() ( const Ptr>& x ) { if( is_allocated_ ) { if( typeid(*x) != typeid(*vec_) ) - throw logic_error("Argument and member vector types are different!"); + throw std::logic_error("Argument and member vector types are different!"); if( x->dimension() != vec_->dimension() ) - throw logic_error("Argument and member vector types have different dimensions!"); + throw std::logic_error("Argument and member vector types have different dimensions!"); } else { vec_ = x->clone(); @@ -118,7 +116,7 @@ class VectorClone { template class VectorCloneMap { private: - map> clones_; + std::map> clones_; template void Constructor_Impl( First first, Rest... 
rest ) { diff --git a/packages/rol/src/vector/ROL_VectorWorkspace.hpp b/packages/rol/src/vector/ROL_VectorWorkspace.hpp index a1430bd4271b..96cba971d63e 100644 --- a/packages/rol/src/vector/ROL_VectorWorkspace.hpp +++ b/packages/rol/src/vector/ROL_VectorWorkspace.hpp @@ -60,22 +60,22 @@ Will allocate new memory of a clone of x *if needed* and return a pointer to the clone. A new clone is considered to be needed - only if these is not a previously allocated compatible vector + only if these is not a previously allocated compatible std::vector stored in the VectorWorkspace. Compatibility is determined by derived type (typeid::hash_code) - and vector dimension. Together these form a VectorKey. - When cloning a vector inside a member function, VectorWorkspace + and std::vector dimension. Together these form a VectorKey. + When cloning a std::vector inside a member function, VectorWorkspace will identify it's VectorKey type. If such a type exists in the database, with will then refer to the associated VectorStack that is specific to the VectorKey type. The VectorStack will be searched for the first available dynamically - allocated vector which has no external references to it and return - a pointer to it. If no such vector exists, a new one will be + allocated std::vector which has no external references to it and return + a pointer to it. If no such std::vector exists, a new one will be cloned and added to the stack. When the local pointers to the VectorStack elements go out of scope at the end of the member - function, the reference counts are decremented and the vectors + function, the reference counts are decremented and the std::vectors become available for use again. 
NOTE: Stored clones will have a reference count of 2 when there @@ -94,14 +94,11 @@ namespace ROL { namespace details { -using namespace std; - - template class VectorWorkspace { using V = ROL::Vector; - using size_type = typename vector::size_type; + using size_type = typename std::vector::size_type; private: @@ -117,10 +114,10 @@ class VectorWorkspace { VectorKey( const Ptr& x ) : VectorKey( *x ) {} - static string to_string( const VectorKey& key ) { - stringstream ss; - ss << "VectorKey(" << hex << key.hash_code << "," - << dec << key.dimension << ")"; + static std::string to_string( const VectorKey& key ) { + std::stringstream ss; + ss << "VectorKey(" << std::hex << key.hash_code << "," + << std::dec << key.dimension << ")"; return ss.str(); } @@ -141,7 +138,7 @@ class VectorWorkspace { struct VectorStack { friend class VectorWorkspace; - vector> vectors_; + std::vector> vectors_; VectorKey key_; VectorStack( const V& x ) : vectors_( 1, x.clone() ), @@ -163,37 +160,37 @@ class VectorWorkspace { Ptr clone( const V& x ) { VectorKey x_key(x); - ROL_TEST_FOR_EXCEPTION( key_.hash_code != x_key.hash_code, logic_error, - "VectorWorkspace::VectorStack tried to clone a vector of type " << - hex << key_.hash_code << ", but it can only clone vectors of type " << - hex << x_key.hash_code ); - - ROL_TEST_FOR_EXCEPTION( key_.dimension != x_key.dimension, logic_error, - "VectorWorkspace::VectorStack tried to clone a vector of dimension " << - hex << key_.dimension << ", but it can only clone vectors of dimension " << - hex << x_key.dimension ); - - for( auto e : vectors_ ) { // Return first unreferenced vector - if( getCount(e) <= 2 ) { // Storing pointers in vector increments count + ROL_TEST_FOR_EXCEPTION( key_.hash_code != x_key.hash_code, std::logic_error, + "VectorWorkspace::VectorStack tried to clone a std::vector of type " << + std::hex << key_.hash_code << ", but it can only clone std::vectors of type " << + std::hex << x_key.hash_code ); + + ROL_TEST_FOR_EXCEPTION( 
key_.dimension != x_key.dimension, std::logic_error, + "VectorWorkspace::VectorStack tried to clone a std::vector of dimension " << + std::hex << key_.dimension << ", but it can only clone std::vectors of dimension " << + std::hex << x_key.dimension ); + + for( auto e : vectors_ ) { // Return first unreferenced std::vector + if( getCount(e) <= 2 ) { // Storing pointers in std::vector increments count return e; } } - // If no unreferenced vectors exist, add a new one + // If no unreferenced std::vectors exist, add a new one auto v = x.clone(); vectors_.push_back( v ); return v; } // For testing purposes - vector getRefCounts( void ) const { - vector counts; + std::vector getRefCounts( void ) const { + std::vector counts; for( auto e: vectors_ ) counts.push_back( getCount(e) ); return counts; } }; // VectorStack - map> workspace_; + std::map> workspace_; public: @@ -207,7 +204,7 @@ class VectorWorkspace { if( key_count == 0 ) { // New key vstack = makePtr(x); - workspace_.insert( make_pair(key,vstack) ); + workspace_.insert( std::make_pair(key,vstack) ); } else vstack = workspace_[key]; @@ -225,18 +222,18 @@ class VectorWorkspace { Ptr copy( const Ptr& x ) { return copy(*x); } - void status( ostream& os ) const { - os << "\n\n" << string(80,'-') << std::endl; + void status( std::ostream& os ) const { + os << "\n\n" << std::string(80,'-') << std::endl; os << "VectorWorkspace contains the following VectorStack(hash_code,dim) entries:\n\n"; for( auto entry : workspace_ ) { - os << " VectorStack(" << hex << entry.first.hash_code << "," - << dec << entry.first.dimension << ")"; + os << " VectorStack(" << std::hex << entry.first.hash_code << "," + << std::dec << entry.first.dimension << ")"; os << "\n Reference Counts per element" << std::endl; for( auto e : entry.second->vectors_ ) { os << " " << getCount( e ) << std::endl; } } - os << string(80,'-') << std::endl; + os << std::string(80,'-') << std::endl; } diff --git a/packages/rol/src/zoo/ROL_Stream.hpp 
b/packages/rol/src/zoo/ROL_Stream.hpp index 883af0a4bf88..69da2e830efa 100644 --- a/packages/rol/src/zoo/ROL_Stream.hpp +++ b/packages/rol/src/zoo/ROL_Stream.hpp @@ -61,27 +61,25 @@ namespace ROL { namespace details { -using namespace std; - template -class basic_nullstream : virtual public basic_ostream<_CharT, _Traits> { +class basic_nullstream : virtual public std::basic_ostream<_CharT, _Traits> { public: - explicit basic_nullstream() : basic_ostream<_CharT, _Traits>(NULL) {} + explicit basic_nullstream() : std::basic_ostream<_CharT, _Traits>(NULL) {} }; -using nullstream = basic_nullstream>; +using nullstream = basic_nullstream>; inline -Ptr makeStreamPtr( ostream& os, bool noSuppressOutput=true ) { - Ptr retstream; - if( noSuppressOutput ) retstream = makePtrFromRef(os); +Ptr makeStreamPtr( std::ostream& os, bool noSuppressOutput=true ) { + Ptr retstream; + if( noSuppressOutput ) retstream = makePtrFromRef(os); else retstream = makePtr(); return retstream; // noSuppressOutput ? makePtrFromRef( os ) : makePtr(); } inline -Ptr makeStreamPtr( Ptr os, bool noSuppressOutput=true ) { - Ptr retstream; +Ptr makeStreamPtr( Ptr os, bool noSuppressOutput=true ) { + Ptr retstream; if( noSuppressOutput ) retstream = os; else retstream = makePtr(); return retstream; // noSuppressOutput ? 
makePtrFromRef( os ) : makePtr(); From ff6aeb07b243e168f83c785feba5138d1de332a9 Mon Sep 17 00:00:00 2001 From: Greg von Winckel Date: Wed, 16 Oct 2024 14:58:25 -0600 Subject: [PATCH 061/243] Added GitHub Action to update rol_parameters.xml automatically Signed-off-by: Greg von Winckel --- .github/workflows/update_rol_parameters.yml | 39 +++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/update_rol_parameters.yml diff --git a/.github/workflows/update_rol_parameters.yml b/.github/workflows/update_rol_parameters.yml new file mode 100644 index 000000000000..43fdf289be70 --- /dev/null +++ b/.github/workflows/update_rol_parameters.yml @@ -0,0 +1,39 @@ +name: Update ROL Parameters + +on: + push: + branches: + - develop + +jobs: + update-parameters: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Prepare environment + run: | + python -m pip install --upgrade pip + chmod +x find_parameters.sh + + - name: Run rol_parameters script + run: python rol_parameters.py . + + # Uncomment and modify the following step if you want to move the file to a specific directory + # - name: Move XML file to specific directory + # run: | + # mkdir -p path/to/desired/directory + # mv rol_parameters.xml path/to/desired/directory/ + + - name: Commit and push if changes + run: | + git config --global user.name 'GitHub Action' + git config --global user.email 'action@github.com' + git add rol_parameters.xml + git diff --quiet && git diff --staged --quiet || (git commit -m "Update ROL parameters XML" && git push) + From f01bf741e84712b94556f1412317504982a7370e Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 16 Oct 2024 15:18:39 -0600 Subject: [PATCH 062/243] Fixed incorrect specification of t0_ type. 
Signed-off-by: Drew Kouri --- .../src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm.hpp index a4b546ae88c6..989735618c9a 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_InexactNewtonAlgorithm.hpp @@ -56,7 +56,7 @@ namespace TypeP { template class InexactNewtonAlgorithm : public TypeP::Algorithm { private: - int t0_; + Real t0_; bool initProx_; int maxit_; ///< Maximum number of line search steps (default: 20) From 2b7dc1815fe846c660be962371a8f2081a14d53d Mon Sep 17 00:00:00 2001 From: Drew Kouri Date: Wed, 16 Oct 2024 15:21:10 -0600 Subject: [PATCH 063/243] Fixed incorrect type for t0_. Signed-off-by: Drew Kouri --- .../rol/src/algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm.hpp index 294b22f0b439..f7675bb5e363 100644 --- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm.hpp +++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_QuasiNewtonAlgorithm.hpp @@ -61,7 +61,7 @@ class QuasiNewtonAlgorithm : public TypeP::Algorithm { ESecant esec_; ///< Secant type std::string secantName_; ///< Secant name - int t0_; + Real t0_; bool initProx_; int maxit_; ///< Maximum number of line search steps (default: 20) From 0aee00546706c6574531ca56720fc332c2c32814 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Wed, 31 Jul 2024 18:16:24 -0600 Subject: [PATCH 064/243] Tpetra: TAFC Converted to use Kokkos Kokkos versions of doPosts(), doPostsAllToALl(), and doPostsNbrAllToAllV() added to Tpetra_Details_DistributorActor.hpp. Kokkos version of doPosts() added to Tpetra_Distributor.hpp. 
Tpetra_CrsMatrix_def.hpp edited to use these new methods. Some syncs have been removed as they are now superfluous. Signed-off-by: Ian Halim --- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 90 +-- .../src/Tpetra_Details_DistributorActor.hpp | 652 +++++++++++++++++- .../tpetra/core/src/Tpetra_Distributor.hpp | 89 ++- 3 files changed, 757 insertions(+), 74 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index f0eef6b3b32e..a88b5ca649ba 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -47,6 +47,7 @@ #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_getDiagCopy.hpp" #include "KokkosSparse_spmv.hpp" +#include "Kokkos_StdAlgorithms.hpp" #include #include @@ -8301,24 +8302,16 @@ CrsMatrix:: << std::endl; std::cerr << os.str (); } - // Make sure that host has the latest version, since we're - // using the version on host. If host has the latest - // version, syncing to host does nothing. 
- destMat->numExportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numExportPacketsPerLID = - getArrayViewFromDualView (destMat->numExportPacketsPerLID_); - destMat->numImportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numImportPacketsPerLID = - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); - + destMat->numExportPacketsPerLID_.sync_device(); + auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device(); + auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device(); if (verbose) { std::ostringstream os; os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits" << std::endl; std::cerr << os.str (); } - Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1, - destMat->numImportPacketsPerLID_.view_host()); + Distor.doReversePostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID); if (verbose) { std::ostringstream os; os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits" @@ -8326,34 +8319,26 @@ CrsMatrix:: std::cerr << os.str (); } - size_t totalImportPackets = 0; - for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) { - totalImportPackets += numImportPacketsPerLID[i]; - } + size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID); // Reallocation MUST go before setting the modified flag, // because it may clear out the flags. destMat->reallocImportsIfNeeded (totalImportPackets, verbose, verbosePrefix.get ()); destMat->imports_.modify_host (); - auto hostImports = destMat->imports_.view_host(); - // This is a legacy host pack/unpack path, so use the host - // version of exports_. 
- destMat->exports_.sync_host (); - auto hostExports = destMat->exports_.view_host(); + auto deviceImports = destMat->imports_.view_device(); + auto deviceExports = destMat->exports_.view_device(); if (verbose) { std::ostringstream os; - os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits" + os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaitsKokkos" << std::endl; std::cerr << os.str (); } - Distor.doReversePostsAndWaits (hostExports, - numExportPacketsPerLID, - hostImports, - numImportPacketsPerLID); + destMat->imports_.sync_device(); + Distor.doReversePostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID); if (verbose) { std::ostringstream os; - os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits" + os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaitsKokkos" << std::endl; std::cerr << os.str (); } @@ -8396,23 +8381,16 @@ CrsMatrix:: << std::endl; std::cerr << os.str (); } - // Make sure that host has the latest version, since we're - // using the version on host. If host has the latest - // version, syncing to host does nothing. 
- destMat->numExportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numExportPacketsPerLID = - getArrayViewFromDualView (destMat->numExportPacketsPerLID_); - destMat->numImportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numImportPacketsPerLID = - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); + destMat->numExportPacketsPerLID_.sync_device (); + auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device(); + auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device(); if (verbose) { std::ostringstream os; os << *verbosePrefix << "Calling 3-arg doPostsAndWaits" << std::endl; std::cerr << os.str (); } - Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1, - destMat->numImportPacketsPerLID_.view_host()); + Distor.doPostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID); if (verbose) { std::ostringstream os; os << *verbosePrefix << "Finished 3-arg doPostsAndWaits" @@ -8420,34 +8398,26 @@ CrsMatrix:: std::cerr << os.str (); } - size_t totalImportPackets = 0; - for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) { - totalImportPackets += numImportPacketsPerLID[i]; - } + size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID); // Reallocation MUST go before setting the modified flag, // because it may clear out the flags. destMat->reallocImportsIfNeeded (totalImportPackets, verbose, verbosePrefix.get ()); destMat->imports_.modify_host (); - auto hostImports = destMat->imports_.view_host(); - // This is a legacy host pack/unpack path, so use the host - // version of exports_. 
- destMat->exports_.sync_host (); - auto hostExports = destMat->exports_.view_host(); + auto deviceImports = destMat->imports_.view_device(); + auto deviceExports = destMat->exports_.view_device(); if (verbose) { std::ostringstream os; - os << *verbosePrefix << "Calling 4-arg doPostsAndWaits" + os << *verbosePrefix << "Calling 4-arg doPostsAndWaitsKokkos" << std::endl; std::cerr << os.str (); } - Distor.doPostsAndWaits (hostExports, - numExportPacketsPerLID, - hostImports, - numImportPacketsPerLID); + destMat->imports_.sync_device (); + Distor.doPostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID); if (verbose) { std::ostringstream os; - os << *verbosePrefix << "Finished 4-arg doPostsAndWaits" + os << *verbosePrefix << "Finished 4-arg doPostsAndWaitsKokkos" << std::endl; std::cerr << os.str (); } @@ -8494,12 +8464,6 @@ CrsMatrix:: Teuchos::Array RemotePids; if (runOnHost) { Teuchos::Array TargetPids; - // Backwards compatibility measure. We'll use this again below. - - // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been) - // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). - // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. - destMat->numImportPacketsPerLID_.modify_host(); //FIXME # ifdef HAVE_TPETRA_MMM_TIMINGS RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); @@ -8691,14 +8655,6 @@ CrsMatrix:: } else { // run on device - - // Backwards compatibility measure. We'll use this again below. - - // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been) - // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). - // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. 
- destMat->numImportPacketsPerLID_.modify_host(); //FIXME - # ifdef HAVE_TPETRA_MMM_TIMINGS RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); # endif diff --git a/packages/tpetra/core/src/Tpetra_Details_DistributorActor.hpp b/packages/tpetra/core/src/Tpetra_Details_DistributorActor.hpp index 9b021ac53e9b..24e8351a6133 100644 --- a/packages/tpetra/core/src/Tpetra_Details_DistributorActor.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_DistributorActor.hpp @@ -22,6 +22,7 @@ #include "Teuchos_Time.hpp" #include "Kokkos_TeuchosCommAdapters.hpp" +#include "Kokkos_StdAlgorithms.hpp" #ifdef HAVE_TPETRA_MPI #include "mpi.h" @@ -53,6 +54,13 @@ class DistributorActor { const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); + template + void doPostsAndWaitsKokkos(const DistributorPlan& plan, + const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + template void doPosts(const DistributorPlan& plan, const ExpView& exports, @@ -66,6 +74,27 @@ class DistributorActor { const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); + template + void doPostsKokkos(const DistributorPlan& plan, + const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + + template + void doPostsAllToAllKokkos( + const DistributorPlan &plan, const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + + template + void doPostsNbrAllToAllVKokkos( + const DistributorPlan &plan, const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + void doWaits(const DistributorPlan& plan); bool isReady() const; @@ -147,6 +176,22 
@@ void DistributorActor::doPostsAndWaits(const DistributorPlan& plan, doWaits(plan); } + +template +void DistributorActor::doPostsAndWaitsKokkos(const DistributorPlan& plan, + const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) +{ + static_assert(areKokkosViews, + "Data arrays for DistributorActor::doPostsAndWaitsKokkos must be Kokkos::Views"); + static_assert(areKokkosViews, + "Num packets arrays for DistributorActor::doPostsAndWaitsKokkos must be Kokkos::Views"); + doPostsKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + doWaits(plan); +} + template using HostAccessibility = Kokkos::SpaceAccessibility; @@ -760,6 +805,140 @@ void DistributorActor::doPostsAllToAll( << "\"."); } +template +void DistributorActor::doPostsAllToAllKokkos( + const DistributorPlan &plan, const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) { + TEUCHOS_TEST_FOR_EXCEPTION( + !plan.getIndicesTo().is_null(), std::runtime_error, + "Send Type=\"Alltoall\" only works for fast-path communication."); + + using size_type = Teuchos::Array::size_type; + using ExpExecSpace = typename ExpPacketsView::execution_space; + using ImpExecSpace = typename ImpPacketsView::execution_space; + + auto comm = plan.getComm(); + Kokkos::View sendcounts("sendcounts", comm->getSize()); + Kokkos::View sdispls("sdispls", comm->getSize()); + Kokkos::View recvcounts("recvcounts", comm->getSize()); + Kokkos::View rdispls("rdispls", comm->getSize()); + + auto sendcounts_d = Kokkos::create_mirror_view(ExpExecSpace(), sendcounts); + auto sdispls_d = Kokkos::create_mirror_view(ExpExecSpace(), sdispls); + auto recvcounts_d = Kokkos::create_mirror_view(ImpExecSpace(), recvcounts); + auto rdispls_d = Kokkos::create_mirror_view(ImpExecSpace(), rdispls); + + auto getStartsTo = 
Kokkos::Compat::getKokkosViewDeepCopy(plan.getStartsTo()); + auto getLengthsTo = Kokkos::Compat::getKokkosViewDeepCopy(plan.getLengthsTo()); + auto getProcsTo = Kokkos::Compat::getKokkosViewDeepCopy(plan.getProcsTo()); + + size_t curPKToffset = 0; + Kokkos::parallel_scan(Kokkos::RangePolicy(0, plan.getNumSends()), KOKKOS_LAMBDA(const size_t pp, size_t& offset, bool is_final) { + sdispls_d(getProcsTo(pp)) = offset; + size_t numPackets = 0; + for (size_t j = getStartsTo(pp); j < getStartsTo(pp) + getLengthsTo(pp); ++j) { + numPackets += numExportPacketsPerLID(j); + } + sendcounts_d(getProcsTo(pp)) = static_cast(numPackets); + offset += numPackets; + }, curPKToffset); + + int overflow; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, plan.getNumSends()), KOKKOS_LAMBDA(const size_t pp, int& index) { + if(sendcounts_d(getProcsTo(pp)) < 0) { + index = pp+1; + } + }, overflow); + + // numPackets is converted down to int, so make sure it can be represented + TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error, + "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): " + "Send count for send " + << overflow-1 << " is too large " + "to be represented as int."); + + const size_type actualNumReceives = + Teuchos::as(plan.getNumReceives()) + + Teuchos::as(plan.hasSelfMessage() ? 
1 : 0); + + auto getLengthsFrom = Kokkos::Compat::getKokkosViewDeepCopy(plan.getLengthsFrom()); + auto getProcsFrom = Kokkos::Compat::getKokkosViewDeepCopy(plan.getProcsFrom()); + + Kokkos::View curLIDoffset("curLIDoffset", actualNumReceives); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, actualNumReceives), KOKKOS_LAMBDA(const size_type i, size_t& offset, bool is_final) { + if(is_final) curLIDoffset(i) = offset; + offset += getLengthsFrom(i); + }); + + Kokkos::parallel_scan(Kokkos::RangePolicy(0, actualNumReceives), KOKKOS_LAMBDA(const size_type i, size_t& curBufferOffset, bool is_final) { + size_t totalPacketsFrom_i = 0; + for(size_t j = 0; j < getLengthsFrom(i); j++) { + totalPacketsFrom_i += numImportPacketsPerLID(curLIDoffset(i) + j); + } + + if(is_final) rdispls_d(getProcsFrom(i)) = curBufferOffset; + if(is_final) recvcounts_d(getProcsFrom(i)) = static_cast(totalPacketsFrom_i); + curBufferOffset += totalPacketsFrom_i; + }); + + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, actualNumReceives), KOKKOS_LAMBDA(const size_type i, int& index) { + if(recvcounts_d(getProcsFrom(i)) < 0) { + index = i+1; + } + }, overflow); + + // totalPacketsFrom_i is converted down to int, so make sure it can be + // represented + TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error, + "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): " + "Recv count for receive " + << overflow-1 << " is too large " + "to be represented as int."); + + Kokkos::deep_copy(sendcounts, sendcounts_d); + Kokkos::deep_copy(sdispls, sdispls_d); + Kokkos::deep_copy(recvcounts, recvcounts_d); + Kokkos::deep_copy(rdispls, rdispls_d); + + Teuchos::RCP> mpiComm = + Teuchos::rcp_dynamic_cast>(comm); + Teuchos::RCP> rawComm = + mpiComm->getRawMpiComm(); + using T = typename ExpView::non_const_value_type; + MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits::getType(T()); + +#if defined(HAVE_TPETRACORE_MPI_ADVANCE) + if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) { + MPIX_Comm 
*mpixComm = *plan.getMPIXComm(); + TEUCHOS_TEST_FOR_EXCEPTION(!mpixComm, std::runtime_error, + "MPIX_Comm is null in doPostsAllToAll \"" + << __FILE__ << ":" << __LINE__); + + const int err = MPIX_Alltoallv( + exports.data(), sendcounts.data(), sdispls.data(), rawType, + imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm); + + TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error, + "MPIX_Alltoallv failed with error \"" + << Teuchos::mpiErrorCodeToString(err) + << "\"."); + + return; + } +#endif // HAVE_TPETRACORE_MPI_ADVANCE + + const int err = MPI_Alltoallv( + exports.data(), sendcounts.data(), sdispls.data(), rawType, + imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)()); + + TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error, + "MPI_Alltoallv failed with error \"" + << Teuchos::mpiErrorCodeToString(err) + << "\"."); +} + #if defined(HAVE_TPETRACORE_MPI_ADVANCE) template void DistributorActor::doPostsNbrAllToAllV( @@ -840,6 +1019,117 @@ void DistributorActor::doPostsNbrAllToAllV( << Teuchos::mpiErrorCodeToString(err) << "\"."); } + +template +void DistributorActor::doPostsNbrAllToAllVKokkos( + const DistributorPlan &plan, const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) { + TEUCHOS_TEST_FOR_EXCEPTION( + !plan.getIndicesTo().is_null(), std::runtime_error, + "Send Type=\"Alltoall\" only works for fast-path communication."); + + const Teuchos_Ordinal numSends = plan.getProcsTo().size(); + const Teuchos_Ordinal numRecvs = plan.getProcsFrom().size(); + + auto comm = plan.getComm(); + Kokkos::View sendcounts("sendcounts", comm->getSize()); + Kokkos::View sdispls("sdispls", comm->getSize()); + Kokkos::View recvcounts("recvcounts", comm->getSize()); + Kokkos::View rdispls("rdispls", comm->getSize()); + + auto sendcounts_d = Kokkos::create_mirror_view(ExpExecSpace(), sendcounts); + auto sdispls_d = 
Kokkos::create_mirror_view(ExpExecSpace(), sdispls); + auto recvcounts_d = Kokkos::create_mirror_view(ImpExecSpace(), recvcounts); + auto rdispls_d = Kokkos::create_mirror_view(ImpExecSpace(), rdispls); + + auto getStartsTo = Kokkos::Compat::getKokkosViewDeepCopy(plan.getStartsTo()); + auto getLengthsTo = Kokkos::Compat::getKokkosViewDeepCopy(plan.getLengthsTo()); + + Teuchos::RCP> mpiComm = + Teuchos::rcp_dynamic_cast>(comm); + Teuchos::RCP> rawComm = + mpiComm->getRawMpiComm(); + using T = typename ExpView::non_const_value_type; + using ExpExecSpace = typename ExpPacketsView::execution_space; + using ImpExecSpace = typename ImpPacketsView::execution_space; + MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits::getType(T()); + + // unlike standard alltoall, entry `i` in sdispls and sendcounts + // refer to the ith participating rank, rather than rank i + Kokkos::parallel_scan(Kokkos::RangePolicy(0, numSends), KOKKOS_LAMBDA(const Teuchos_Ordinal pp, size_t& curPKToffset, bool is_final) { + sdispls_d(pp) = curPKToffset; + size_t numPackets = 0; + for (size_t j = getStartsTo(pp); j < getStartsTo(pp) + getLengthsTo(pp); ++j) { + numPackets += numExportPacketsPerLID(j); + } + sendcounts_d(pp) = static_cast(numPackets); + curPKToffset += numPackets; + }); + + int overflow; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numSends), KOKKOS_LAMBDA(const Teuchos_Ordinal pp, int& index) { + if(sendcounts_d(pp) < 0) { + index = i+1; + } + }, overflow); + + // numPackets is converted down to int, so make sure it can be represented + TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error, + "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): " + "Send count for send " + << overflow-1 << " is too large " + "to be represented as int."); + + auto getLengthsFrom = Kokkos::Compat::getKokkosViewDeepCopy(plan.getLengthsFrom()); + + Kokkos::View curLIDoffset("curLIDoffset", numRecvs); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, numRecvs), KOKKOS_LAMBDA(const Teuchos_Ordinal i, 
size_t& offset, bool is_final) { + if(is_final) curLIDoffset(i) = offset; + offset += getLengthsFrom(i); + }); + + Kokkos::parallel_scan(Kokkos::RangePolicy(0, numRecvs), KOKKOS_LAMBDA(const Teuchos_Ordinal i, size_t& curBufferOffset, bool is_final) { + rdispls_d(i) = curBufferOffset; + size_t totalPacketsFrom_i = 0; + for(size_t j = 0; j < getLengthsFrom(i); j++) { + totalPacketsFrom_i += numImportPacketsPerLID(curLIDoffset(i) + j); + } + + recvcounts_d(i) = static_cast(totalPacketsFrom_i); + curBufferOffset += totalPacketsFrom_i; + }); + + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numRecvs), KOKKOS_LAMBDA(const Teuchos_Ordinal i, int& index) { + if(recvcounts_d(pp) < 0) { + index = i+1; + } + }, overflow); + + // totalPacketsFrom_i is converted down to int, so make sure it can be + // represented + TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error, + "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): " + "Recv count for receive " + << overflow-1 << ") is too large " + "to be represented as int."); + + Kokkos::deep_copy(sendcounts, sendcounts_d); + Kokkos::deep_copy(sdispls, sdispls_d); + Kokkos::deep_copy(recvcounts, recvcounts_d); + Kokkos::deep_copy(rdispls, rdispls_d); + + MPIX_Comm *mpixComm = *plan.getMPIXComm(); + const int err = MPIX_Neighbor_alltoallv( + exports.data(), sendcounts.data(), sdispls.data(), rawType, + imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm); + + TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error, + "MPIX_Neighbor_alltoallv failed with error \"" + << Teuchos::mpiErrorCodeToString(err) + << "\"."); +} #endif // HAVE_TPETRACORE_MPI_ADVANCE #endif // HAVE_TPETRA_MPI // clang-format off @@ -1107,16 +1397,16 @@ void DistributorActor::doPosts(const DistributorPlan& plan, // This buffer is long enough for only one message at a time. // Thus, we use DISTRIBUTOR_SEND always in this case, regardless - // of sendType requested by user. + // of sendType requested by user. 
// This code path formerly errored out with message: - // Tpetra::Distributor::doPosts(4-arg, Kokkos): + // Tpetra::Distributor::doPosts(4-arg, Kokkos): // The "send buffer" code path // doesn't currently work with nonblocking sends. // Now, we opt to just do the communication in a way that works. #ifdef HAVE_TPETRA_DEBUG if (sendType != Details::DISTRIBUTOR_SEND) { if (plan.getComm()->getRank() == 0) - std::cout << "The requested Tpetra send type " + std::cout << "The requested Tpetra send type " << DistributorSendTypeEnumToString(sendType) << " requires Distributor data to be ordered by" << " the receiving processor rank. Since these" @@ -1125,7 +1415,7 @@ void DistributorActor::doPosts(const DistributorPlan& plan, } #endif - Kokkos::View sendArray ("sendArray", + Kokkos::View sendArray ("sendArray", maxNumPackets); Array indicesOffsets (numExportPacketsPerLID.size(), 0); @@ -1180,6 +1470,360 @@ void DistributorActor::doPosts(const DistributorPlan& plan, } } +template +void DistributorActor::doPostsKokkos(const DistributorPlan& plan, + const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) +{ + static_assert(areKokkosViews, + "Data arrays for DistributorActor::doPostsKokkos must be Kokkos::Views"); + static_assert(areKokkosViews, + "Num packets arrays for DistributorActor::doPostsKokkos must be Kokkos::Views"); + using Teuchos::Array; + using Teuchos::as; + using Teuchos::ireceive; + using Teuchos::isend; + using Teuchos::send; + using Teuchos::TypeNameTraits; + using std::endl; + using Kokkos::Compat::create_const_view; + using Kokkos::Compat::create_view; + using Kokkos::Compat::subview_offset; + using Kokkos::Compat::deep_copy_offset; + using ExpExecSpace = typename ExpPacketsView::execution_space; + using ImpExecSpace = typename ImpPacketsView::execution_space; + typedef Array::size_type size_type; + typedef ExpView exports_view_type; + typedef ImpView 
imports_view_type; + +#ifdef KOKKOS_ENABLE_CUDA + static_assert (! std::is_same::value && + ! std::is_same::value, + "Please do not use Tpetra::Distributor with UVM " + "allocations. See GitHub issue #1088."); +#endif // KOKKOS_ENABLE_CUDA + +#ifdef KOKKOS_ENABLE_SYCL + static_assert (! std::is_same::value && + ! std::is_same::value, + "Please do not use Tpetra::Distributor with SharedUSM " + "allocations. See GitHub issue #1088 (corresponding to CUDA)."); +#endif // KOKKOS_ENABLE_SYCL + +#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS + Teuchos::TimeMonitor timeMon (*timer_doPosts4KV_); +#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS + + // Run-time configurable parameters that come from the input + // ParameterList set by setParameterList(). + const Details::EDistributorSendType sendType = plan.getSendType(); + +#ifdef HAVE_TPETRA_MPI + // All-to-all communication layout is quite different from + // point-to-point, so we handle it separately. + if (sendType == Details::DISTRIBUTOR_ALLTOALL) { + doPostsAllToAllKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + return; + } +#ifdef HAVE_TPETRACORE_MPI_ADVANCE + else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) + { + doPostsAllToAllKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + return; + } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) { + doPostsNbrAllToAllVKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + return; + } +#endif + +#else // HAVE_TPETRA_MPI + if (plan.hasSelfMessage()) { + size_t packetsPerSend; + Kokkos::parallel_reduce(Kokkos::RangePolicy(plan.getStartsTo()[0], plan.getStartsTo()[0]+plan.getLengthsTo()[0]), KOKKOS_LAMBDA(const size_t j, size_t& packets) { + packets += numExportPacketsPerLID(j); + }, packetsPerSend); + + deep_copy_offset(imports, exports, (size_t)0, (size_t)0, packetsPerSend); + } +#endif // HAVE_TPETRA_MPI + + const int myProcID = plan.getComm()->getRank (); + size_t 
selfReceiveOffset = 0; + +#ifdef HAVE_TPETRA_DEBUG + // Different messages may have different numbers of packets. + size_t totalNumImportPackets = Kokkos::Experimental::reduce(ImpExecSpace(), numImportPacketsPerLID); + TEUCHOS_TEST_FOR_EXCEPTION( + imports.extent (0) < totalNumImportPackets, std::runtime_error, + "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): The 'imports' array must have " + "enough entries to hold the expected number of import packets. " + "imports.extent(0) = " << imports.extent (0) << " < " + "totalNumImportPackets = " << totalNumImportPackets << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::" + "doPostsKokkos(4 args, Kokkos): Process " << myProcID << ": requests_.size () = " + << requests_.size () << " != 0."); +#endif // HAVE_TPETRA_DEBUG + // Distributor uses requests_.size() as the number of outstanding + // nonblocking message requests, so we resize to zero to maintain + // this invariant. + // + // getNumReceives() does _not_ include the self message, if there is + // one. Here, we do actually send a message to ourselves, so we + // include any self message in the "actual" number of receives to + // post. + // + // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts() + // doesn't (re)allocate its array of requests. That happens in + // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on + // demand), or Resize_(). + const size_type actualNumReceives = as (plan.getNumReceives()) + + as (plan.hasSelfMessage() ? 1 : 0); + requests_.resize (0); + + // Post the nonblocking receives. It's common MPI wisdom to post + // receives before sends. In MPI terms, this means favoring + // adding to the "posted queue" (of receive requests) over adding + // to the "unexpected queue" (of arrived messages not yet matched + // with a receive). 
+ { +#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS + Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4KV_recvs_); +#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS + + size_t curBufferOffset = 0; + size_t curLIDoffset = 0; + for (size_type i = 0; i < actualNumReceives; ++i) { + size_t totalPacketsFrom_i = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, plan.getLengthsFrom()[i]), KOKKOS_LAMBDA(const size_t j, size_t& total) { + total += numImportPacketsPerLID(curLIDoffset+j); + }, totalPacketsFrom_i); + // totalPacketsFrom_i is converted down to int, so make sure it can be represented + TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX), + std::logic_error, "Tpetra::Distributor::doPostsKokkos(3 args, Kokkos): " + "Recv count for receive " << i << " (" << totalPacketsFrom_i << ") is too large " + "to be represented as int."); + curLIDoffset += plan.getLengthsFrom()[i]; + if (plan.getProcsFrom()[i] != myProcID && totalPacketsFrom_i) { + // If my process is receiving these packet(s) from another + // process (not a self-receive), and if there is at least + // one packet to receive: + // + // 1. Set up the persisting view (recvBuf) into the imports + // array, given the offset and size (total number of + // packets from process getProcsFrom()[i]). + // 2. Start the Irecv and save the resulting request. + imports_view_type recvBuf = + subview_offset (imports, curBufferOffset, totalPacketsFrom_i); + requests_.push_back (ireceive (recvBuf, plan.getProcsFrom()[i], + mpiTag_, *plan.getComm())); + } + else { // Receiving these packet(s) from myself + selfReceiveOffset = curBufferOffset; // Remember the offset + } + curBufferOffset += totalPacketsFrom_i; + } + } + +#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS + Teuchos::TimeMonitor timeMonSends (*timer_doPosts4KV_sends_); +#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS + + // setup views containing starting-offsets into exports for each send, + // and num-packets-to-send for each send. 
+ Kokkos::View sendPacketOffsets("sendPacketOffsets", plan.getNumSends()); + Kokkos::View packetsPerSend("packetsPerSend", plan.getNumSends()); + auto sendPacketOffsets_d = Kokkos::create_mirror_view(ExpExecSpace(), sendPacketOffsets); + auto packetsPerSend_d = Kokkos::create_mirror_view(ExpExecSpace(), packetsPerSend); + + auto starts = Kokkos::Compat::getKokkosViewDeepCopy(plan.getStartsTo()); + auto lengths = Kokkos::Compat::getKokkosViewDeepCopy(plan.getLengthsTo()); + + Kokkos::parallel_scan(Kokkos::RangePolicy(0, plan.getNumSends()), KOKKOS_LAMBDA(const size_t pp, size_t& curPKToffset, bool final_pass) { + if(final_pass) sendPacketOffsets_d(pp) = curPKToffset; + size_t numPackets = 0; + for(size_t j = starts(pp); j < starts(pp) + lengths(pp); j++) { + numPackets += numExportPacketsPerLID(j); + } + if(final_pass) packetsPerSend_d(pp) = numPackets; + curPKToffset += numPackets; + }); + + size_t maxNumPackets; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, plan.getNumSends()), KOKKOS_LAMBDA(const size_t pp, size_t& max) { + if(packetsPerSend_d(pp) > max) { + max = packetsPerSend_d(pp); + } + }, Kokkos::Max(maxNumPackets)); + + // numPackets will be used as a message length, so make sure it can be represented as int + TEUCHOS_TEST_FOR_EXCEPTION(maxNumPackets > size_t(INT_MAX), + std::logic_error, "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): " + "numPackets = " << maxNumPackets << " is too large " + "to be represented as int."); + + Kokkos::deep_copy(sendPacketOffsets, sendPacketOffsets_d); + Kokkos::deep_copy(packetsPerSend, packetsPerSend_d); + + // setup scan through getProcsTo() list starting with higher numbered procs + // (should help balance message traffic) + size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage(); + size_t procIndex = 0; + while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myProcID)) { + ++procIndex; + } + if (procIndex == numBlocks) { + procIndex = 0; + } + + size_t selfNum = 0; + size_t selfIndex = 0; + if 
(plan.getIndicesTo().is_null()) { + +#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS + Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_fast_); +#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS + + // Data are already blocked (laid out) by process, so we don't + // need a separate send buffer (besides the exports array). + for (size_t i = 0; i < numBlocks; ++i) { + size_t p = i + procIndex; + if (p > (numBlocks - 1)) { + p -= numBlocks; + } + + if (plan.getProcsTo()[p] != myProcID && packetsPerSend[p] > 0) { + exports_view_type tmpSend = + subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]); + + if (sendType == Details::DISTRIBUTOR_ISEND) { + exports_view_type tmpSendBuf = + subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]); + requests_.push_back (isend (tmpSendBuf, plan.getProcsTo()[p], + mpiTag_, *plan.getComm())); + } + else { // DISTRIBUTOR_SEND + send (tmpSend, + as (tmpSend.size ()), + plan.getProcsTo()[p], mpiTag_, *plan.getComm()); + } + } + else { // "Sending" the message to myself + selfNum = p; + } + } + + if (plan.hasSelfMessage()) { + deep_copy_offset(imports, exports, selfReceiveOffset, + sendPacketOffsets[selfNum], packetsPerSend[selfNum]); + } + } + else { // data are not blocked by proc, use send buffer + +#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS + Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_slow_); +#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS + + // FIXME (mfh 05 Mar 2013) This may be broken for Isend. + typedef typename ExpView::non_const_value_type Packet; + typedef typename ExpView::array_layout Layout; + typedef typename ExpView::device_type Device; + typedef typename ExpView::memory_traits Mem; + + // This buffer is long enough for only one message at a time. + // Thus, we use DISTRIBUTOR_SEND always in this case, regardless + // of sendType requested by user. 
+ // This code path formerly errored out with message: + // Tpetra::Distributor::doPostsKokkos(4-arg, Kokkos): + // The "send buffer" code path + // doesn't currently work with nonblocking sends. + // Now, we opt to just do the communication in a way that works. +#ifdef HAVE_TPETRA_DEBUG + if (sendType != Details::DISTRIBUTOR_SEND) { + if (plan.getComm()->getRank() == 0) + std::cout << "The requested Tpetra send type " + << DistributorSendTypeEnumToString(sendType) + << " requires Distributor data to be ordered by" + << " the receiving processor rank. Since these" + << " data are not ordered, Tpetra will use Send" + << " instead." << std::endl; + } +#endif + + Kokkos::View sendArray ("sendArray", + maxNumPackets); + + Kokkos::View indicesOffsets ("indicesOffsets", numExportPacketsPerLID.extent(0)); + size_t ioffset = 0; + Kokkos::parallel_scan(Kokkos::RangePolicy(0, numExportPacketsPerLID.extent(0)), KOKKOS_LAMBDA(const size_t j, size_t& offset, bool is_final) { + if(is_final) indicesOffsets(j) = offset; + offset += numExportPacketsPerLID(j); + }, ioffset); + + for (size_t i = 0; i < numBlocks; ++i) { + size_t p = i + procIndex; + if (p > (numBlocks - 1)) { + p -= numBlocks; + } + + if (plan.getProcsTo()[p] != myProcID) { + size_t j = plan.getStartsTo()[p]; + size_t numPacketsTo_p = 0; + //mirror in case execspaces are different + auto sendArrayMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), sendArray); + auto exportsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), exports); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, plan.getLengthsTo()[p]), KOKKOS_LAMBDA(const size_t k, size_t& offset, bool is_final) { + if(is_final) { + const size_t dst_end = offset + numExportPacketsPerLID(j + k); + const size_t src_end = indicesOffsets(j + k) + numExportPacketsPerLID(j + k); + auto dst_sub = Kokkos::subview(sendArrayMirror, Kokkos::make_pair(offset, dst_end)); + auto src_sub = Kokkos::subview(exportsMirror, Kokkos::make_pair(indicesOffsets(j + k), 
src_end)); + Kokkos::Experimental::local_deep_copy(dst_sub, src_sub); + } + offset += numExportPacketsPerLID(j + k); + }, numPacketsTo_p); + Kokkos::deep_copy(sendArray, sendArrayMirror); + typename ExpView::execution_space().fence(); + + if (numPacketsTo_p > 0) { + ImpView tmpSend = + subview_offset(sendArray, size_t(0), numPacketsTo_p); + + send (tmpSend, + as (tmpSend.size ()), + plan.getProcsTo()[p], mpiTag_, *plan.getComm()); + } + } + else { // "Sending" the message to myself + selfNum = p; + selfIndex = plan.getStartsTo()[p]; + } + } + + if (plan.hasSelfMessage()) { + //mirror in case execspaces are different + auto importsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), imports); + auto exportsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), exports); + size_t temp; + Kokkos::parallel_scan(Kokkos::RangePolicy(0, plan.getLengthsTo()[selfNum]), KOKKOS_LAMBDA(const size_t k, size_t& offset, bool is_final) { + if(is_final) { + const size_t dst_end = selfReceiveOffset + offset + numExportPacketsPerLID(selfIndex + k); + const size_t src_end = indicesOffsets(selfIndex + k) + numExportPacketsPerLID(selfIndex + k); + auto dst_sub = Kokkos::subview(importsMirror, Kokkos::make_pair(selfReceiveOffset + offset, dst_end)); + auto src_sub = Kokkos::subview(exportsMirror, Kokkos::make_pair(indicesOffsets(selfIndex + k), src_end)); + Kokkos::Experimental::local_deep_copy(dst_sub, src_sub); + } + offset += numExportPacketsPerLID(selfIndex + k); + }, temp); + Kokkos::deep_copy(imports, importsMirror); + selfIndex += plan.getLengthsTo()[selfNum]; + selfReceiveOffset += temp; + } + } +} + } } diff --git a/packages/tpetra/core/src/Tpetra_Distributor.hpp b/packages/tpetra/core/src/Tpetra_Distributor.hpp index c0c31a0f8b54..a8beece8ee9d 100644 --- a/packages/tpetra/core/src/Tpetra_Distributor.hpp +++ b/packages/tpetra/core/src/Tpetra_Distributor.hpp @@ -23,6 +23,7 @@ #include "KokkosCompat_View.hpp" #include "Kokkos_Core.hpp" #include 
"Kokkos_TeuchosCommAdapters.hpp" +#include "Kokkos_StdAlgorithms.hpp" #include #include #include @@ -426,6 +427,13 @@ namespace Tpetra { const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + doPostsAndWaitsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + /// \brief Post the data for a forward plan, but do not execute the waits yet. /// /// Call this overload when you have the same number of Packets @@ -480,6 +488,13 @@ namespace Tpetra { const Teuchos::ArrayView& numExportPacketsPerLID, const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + doPostsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); /// \brief Execute the reverse communication plan. /// @@ -501,7 +516,14 @@ namespace Tpetra { const Teuchos::ArrayView& numExportPacketsPerLID, const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); - + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + doReversePostsAndWaitsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + /// \brief Post the data for a reverse plan, but do not execute the waits yet. 
/// /// This method takes the same arguments as the three-argument @@ -522,7 +544,14 @@ namespace Tpetra { const Teuchos::ArrayView& numExportPacketsPerLID, const ImpView &imports, const Teuchos::ArrayView& numImportPacketsPerLID); - + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + doReversePostsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID); + //@} //! @name Implementation of Teuchos::Describable //@{ @@ -640,6 +669,16 @@ namespace Tpetra { actor_.doPostsAndWaits(plan_, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); } + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + Distributor:: + doPostsAndWaitsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) + { + actor_.doPostsAndWaitsKokkos(plan_, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + } template typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type @@ -661,6 +700,17 @@ namespace Tpetra { { actor_.doPosts(plan_, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); } + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + Distributor:: + doPostsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) + { + actor_.doPostsKokkos(plan_, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); + } template typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type @@ -685,6 +735,19 @@ namespace Tpetra { numImportPacketsPerLID); doReverseWaits (); } + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + Distributor:: + 
doReversePostsAndWaitsKokkos (const ExpView& exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView& imports, + const ImpPacketsView &numImportPacketsPerLID) + { + doReversePostsKokkos (exports, numExportPacketsPerLID, imports, + numImportPacketsPerLID); + doReverseWaits (); + } template typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type @@ -723,7 +786,27 @@ namespace Tpetra { reverseDistributor_->doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID); } - + + template + typename std::enable_if<(Kokkos::is_view::value && Kokkos::is_view::value)>::type + Distributor:: + doReversePostsKokkos (const ExpView &exports, + const ExpPacketsView &numExportPacketsPerLID, + const ImpView &imports, + const ImpPacketsView &numImportPacketsPerLID) + { + // FIXME (mfh 29 Mar 2012) WHY? + TEUCHOS_TEST_FOR_EXCEPTION( + ! plan_.getIndicesTo().is_null(), std::runtime_error, + "Tpetra::Distributor::doReversePosts(3 args): Can only do " + "reverse communication when original data are blocked by process."); + if (reverseDistributor_.is_null ()) { + createReverseDistributor (); + } + reverseDistributor_->doPostsKokkos (exports, numExportPacketsPerLID, + imports, numImportPacketsPerLID); + } + template void Distributor:: computeSends(const Teuchos::ArrayView& importGIDs, From 5c2d10b920da60e2c652da54c49a5ff25fbd443f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 24 Oct 2024 15:56:44 -0600 Subject: [PATCH 065/243] config-specs: set Kokkos_CoreUnitTest_Cuda1 to run serial attempt to resolve the cuda_graph.diamond subtest failure in nightly integration testing track Signed-off-by: Nathan Ellingwood --- packages/framework/ini-files/config-specs.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 2c0ce84d57bf..de052bca3530 100644 --- a/packages/framework/ini-files/config-specs.ini +++ 
b/packages/framework/ini-files/config-specs.ini @@ -1248,6 +1248,7 @@ opt-set-cmake-var Tpetra_INST_SERIAL BOOL FORCE : ON opt-set-cmake-var Zoltan_ENABLE_Scotch BOOL FORCE : OFF [CUDA11-RUN-SERIAL-TESTS] +opt-set-cmake-var Kokkos_CoreUnitTest_Cuda1_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var KokkosKernels_sparse_cuda_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var KokkosKernels_batched_dla_cuda_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var Intrepid2_unit-test_MonolithicExecutable_Intrepid2_Tests_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON From 26b6f00960c2f3c5895a70151d67eff8628ff159 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Thu, 17 Oct 2024 13:08:11 -0600 Subject: [PATCH 066/243] ShyLU - Basker : replace View-of-Views with std::vector-of-Views Signed-off-by: iyamazaki --- .../shylu_node/basker/src/shylubasker_def.hpp | 45 +-- .../basker/src/shylubasker_error_manager.hpp | 162 +++++------ .../basker/src/shylubasker_matrix_decl.hpp | 3 + .../basker/src/shylubasker_matrix_def.hpp | 9 +- .../basker/src/shylubasker_nfactor_blk.hpp | 144 +++++----- .../src/shylubasker_nfactor_blk_inc.hpp | 272 +++++++++--------- .../basker/src/shylubasker_nfactor_col.hpp | 266 ++++++++--------- .../basker/src/shylubasker_nfactor_col2.hpp | 84 +++--- .../src/shylubasker_nfactor_col_inc.hpp | 266 ++++++++--------- .../basker/src/shylubasker_nfactor_diag.hpp | 66 ++--- .../basker/src/shylubasker_order.hpp | 16 +- .../basker/src/shylubasker_sfactor.hpp | 189 +++++++----- .../basker/src/shylubasker_sfactor_inc.hpp | 52 ++-- .../basker/src/shylubasker_solve_rhs.hpp | 16 +- .../basker/src/shylubasker_solve_rhs_tr.hpp | 16 +- .../basker/src/shylubasker_structs.hpp | 7 +- .../basker/src/shylubasker_tree.hpp | 33 ++- .../basker/src/shylubasker_types.hpp | 158 +++++----- .../basker/src/shylubasker_util.hpp | 255 ++++++++-------- 19 files changed, 1089 insertions(+), 970 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp 
b/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp index c1b92347a094..c7b9d66311ab 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp @@ -92,16 +92,8 @@ namespace BaskerNS BASKER_INLINE void Basker::Finalize() { - //finalize all matrices - A.Finalize(); - At.Finalize(); //??? is At even used - BTF_A.Finalize(); - BTF_C.Finalize(); - BTF_B.Finalize(); - BTF_D.Finalize(); - BTF_E.Finalize(); - //finalize array of 2d matrics + // Actuall Finalize is called by desctructor FREE_MATRIX_2DARRAY(AVM, tree.nblks); FREE_MATRIX_2DARRAY(ALM, tree.nblks); @@ -120,7 +112,6 @@ namespace BaskerNS //Thread Array FREE_THREAD_1DARRAY(thread_array); - basker_barrier.Finalize(); //S (Check on this) FREE_INT_2DARRAY(S, tree.nblks); @@ -187,12 +178,6 @@ namespace BaskerNS FREE_ENTRY_1DARRAY(x_view_ptr_scale); FREE_ENTRY_1DARRAY(y_view_ptr_scale); - - //Structures - part_tree.Finalize(); - tree.Finalize(); - stree.Finalize(); - stats.Finalize(); }//end Finalize() @@ -239,7 +224,7 @@ namespace BaskerNS //Option = 2, BTF BASKER if(option == 1) - { + { default_order(); } else if(option == 2) @@ -475,12 +460,16 @@ namespace BaskerNS //Find BTF ordering if(btf_order2() != BASKER_SUCCESS) { + if(Options.verbose == BASKER_TRUE) + { + printf("Basker Ordering Failed \n"); fflush(stdout); + } return BASKER_ERROR; } if(Options.verbose == BASKER_TRUE) { - printf("Basker Ordering Found \n"); + printf("Basker Ordering Found \n"); fflush(stdout); } /*if((Options.btf == BASKER_TRUE) && (btf_tabs_offset != 0)) @@ -512,7 +501,7 @@ namespace BaskerNS if(symb_flag == BASKER_TRUE) { if(Options.verbose == BASKER_TRUE) { - printf("BASKER: YOU CANNOT RERUN SFACTOR\n"); + printf("BASKER: YOU CANNOT RERUN SFACTOR\n"); fflush(stdout); } return BASKER_ERROR; } @@ -547,7 +536,7 @@ namespace BaskerNS if(Options.verbose == BASKER_TRUE) { - printf(" == Basker Symbolic Done ==\n\n"); + printf(" == Basker Symbolic Done ==\n\n"); 
fflush(stdout); } #ifdef BASKER_TIMER @@ -1573,7 +1562,7 @@ namespace BaskerNS #endif } - // ---------------------------------------------------------------------------------------------- + // ---------------------------------------------------------------------------------------------- // 'sort' rows of BTF_A into ND structure #if 0 for (Int i = 0; i < BTF_A.nnz; ++i) { @@ -1621,6 +1610,7 @@ namespace BaskerNS symmetric_sfactor(); if(Options.verbose == BASKER_TRUE) { std::cout<< " > Basker Factor: Time for symbolic after ND on a big block A: " << nd_symbolic_timer.seconds() << std::endl; + fflush(stdout); } Kokkos::Timer nd_last_dense_timer; @@ -1628,16 +1618,23 @@ namespace BaskerNS btf_last_dense(flag); if(Options.verbose == BASKER_TRUE) { std::cout<< " > Basker Factor: Time for last-dense after ND on a big block A: " << nd_last_dense_timer.seconds() << std::endl; + fflush(stdout); } #ifdef BASKER_KOKKOS // ---------------------------------------------------------------------------------------------- // Allocate & Initialize blocks + #ifdef BASKER_PARALLEL_INIT_FACTOR kokkos_sfactor_init_factor iF(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), iF); Kokkos::fence(); + #else + for (Int p = 0; p < num_threads; p++) { + this->t_init_factor(p); + } + #endif /*kokkos_sfactor_init_workspace iWS(flag, this); @@ -1950,10 +1947,16 @@ namespace BaskerNS }*/ Kokkos::Timer nd_setup2_timer; +#ifdef BASKER_PARALLEL_INIT_WORKSPACE kokkos_sfactor_init_workspace iWS(flag, this); Kokkos::parallel_for(TeamPolicy(num_threads,1), iWS); Kokkos::fence(); +#else + for (Int p = 0; p < num_threads; p++) { + this->t_init_workspace(flag, p); + } +#endif if(Options.verbose == BASKER_TRUE) { std::cout<< " > Basker Factor: Time for workspace allocation after ND on a big block A: " << nd_setup2_timer.seconds() << std::endl; } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp index 
a6e1f5c41e91..84cbb8b801b7 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp @@ -46,66 +46,66 @@ namespace BaskerNS for(Int ti = 0; ti < num_threads; ti++) { //Note: jdb we can make this into a switch - if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) + if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) { threads_start(ti) = BASKER_MAX_IDX; continue; - } else if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) + } else if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " DOMBLK SINGULAR: blk=" << thread_array(ti).error_blk + << " DOMBLK SINGULAR: blk=" << thread_array[ti].error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) + } else if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " DOMBLK NOMALLOC : blk=" << thread_array(ti).error_blk + << " DOMBLK NOMALLOC : blk=" << thread_array[ti].error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) + } else if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) { - BASKER_ASSERT(thread_array(ti).error_blk >= 0, "nfactor_dom_error error_blk"); + BASKER_ASSERT(thread_array[ti].error_blk >= 0, "nfactor_dom_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREAD: " << ti - << " DOMBLK MALLOC : blk=" << thread_array(ti).error_blk - << " subblk=" << thread_array(ti).error_subblk - << " newsize=" << thread_array(ti).error_info + << " DOMBLK MALLOC : blk=" << thread_array[ti].error_blk + << " subblk=" << thread_array[ti].error_subblk + << " newsize=" << thread_array[ti].error_info << std::endl; } //If on diagonal, want to compare L and U Int resize_L = BASKER_MAX_IDX; Int resize_U = BASKER_MAX_IDX; - 
if(thread_array(ti).error_subblk != BASKER_MAX_IDX) + if(thread_array[ti].error_subblk != BASKER_MAX_IDX) { - BASKER_ASSERT(thread_array(ti).error_info > 0, "L) newsize not big enough"); - resize_L = thread_array(ti).error_info; + BASKER_ASSERT(thread_array[ti].error_info > 0, "L) newsize not big enough"); + resize_L = thread_array[ti].error_info; //if L is already bigger and U, //We will want re size U as, well - if(thread_array(ti).error_subblk == 0) + if(thread_array[ti].error_subblk == 0) { - Int blkcol = thread_array(ti).error_blk; + Int blkcol = thread_array[ti].error_blk; Int blkUrow = LU_size(blkcol)-1; - if(LL(blkcol)(0).nnz >= - LU(blkcol)(blkUrow).nnz) + if(LL[blkcol][0].nnz >= + LU[blkcol][blkUrow].nnz) { - resize_U = thread_array(ti).error_info; + resize_U = thread_array[ti].error_info; } }//if - a domain } //We don't care about the other way since, //L is already checked before U. - if(thread_array(ti).error_subblk == -1) + if(thread_array[ti].error_subblk == -1) { - resize_U = thread_array(ti).error_info; + resize_U = thread_array[ti].error_info; } //Resize L, if resize_L != -1 (meaning realloc-L is requested) @@ -116,7 +116,7 @@ namespace BaskerNS std::cout << " ++ resize L( tid = " << ti << " ): new size = " << resize_L << std::endl; } BASKER_MATRIX &L = - LL(thread_array(ti).error_blk)(thread_array(ti).error_subblk); + LL[thread_array[ti].error_blk][thread_array[ti].error_subblk]; REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -142,7 +142,7 @@ namespace BaskerNS std::cout << " ++ resize U( tid = " << ti << " ): new size = " << resize_U << std::endl; } BASKER_MATRIX &U = - LU(thread_array(ti).error_blk)(0); + LU[thread_array[ti].error_blk][0]; REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -153,7 +153,7 @@ namespace BaskerNS U.nnz = resize_U; //Still need to clear pend BASKER_MATRIX &L = - LL(thread_array(ti).error_blk)(0); + LL[thread_array[ti].error_blk][0]; L.clear_pend(); } @@ -163,11 +163,11 @@ namespace BaskerNS { //Clear workspace, 
whole column for(Int sb = 0; - sb < LL_size(thread_array(ti).error_blk); + sb < LL_size(thread_array[ti].error_blk); sb++) { BASKER_MATRIX &SL = - LL(thread_array(ti).error_blk)(sb); + LL[thread_array[ti].error_blk][sb]; for(Int i = 0; i < SL.iws_size*SL.iws_mult; ++i) { SL.iws(i) = (Int) 0; @@ -198,13 +198,13 @@ namespace BaskerNS }//for - sb (subblks) }//if ws is filled - threads_start(ti) = thread_array(ti).error_blk; + threads_start(ti) = thread_array[ti].error_blk; //Reset - thread_array(ti).error_type = BASKER_ERROR_NOERROR; - thread_array(ti).error_blk = BASKER_MAX_IDX; - thread_array(ti).error_info = BASKER_MAX_IDX; + thread_array[ti].error_type = BASKER_ERROR_NOERROR; + thread_array[ti].error_blk = BASKER_MAX_IDX; + thread_array[ti].error_info = BASKER_MAX_IDX; nthread_remalloc++; }//if REMALLOC @@ -231,26 +231,26 @@ namespace BaskerNS for(Int ti = 0; ti < num_threads; ti++) { //Note: jdb we can make this into a switch - if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) + if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) { thread_start(ti) = BASKER_MAX_IDX; continue; } - else if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) + else if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " SEPBLK SINGULAR: blk=" << thread_array(ti).error_blk + << " SEPBLK SINGULAR: blk=" << thread_array[ti].error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) + } else if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREADS: " << ti - << " SEPBLK NOMALLOC: blk=" << thread_array(ti).error_blk + << " SEPBLK NOMALLOC: blk=" << thread_array[ti].error_blk << std::endl; } return BASKER_ERROR; @@ -260,22 +260,22 @@ namespace BaskerNS Int error_sep_lvl = BASKER_MAX_IDX; for(Int l = 1; l < tree.nlvls+1; l++) { - if(thread_array(ti).error_blk == S(l)(ti)) + 
if(thread_array[ti].error_blk == S[l][ti]) { error_sep_lvl = l; break; } } - if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) + if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) { - BASKER_ASSERT(thread_array(ti).error_blk >= 0, "nfactor_SEP_error error_blk"); + BASKER_ASSERT(thread_array[ti].error_blk >= 0, "nfactor_SEP_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREADS: " << ti - << " SEPBLK MALLOC: blk=" << thread_array(ti).error_blk - << " subblk=" << thread_array(ti).error_subblk - << " newsize=" << thread_array(ti).error_info + << " SEPBLK MALLOC: blk=" << thread_array[ti].error_blk + << " subblk=" << thread_array[ti].error_subblk + << " newsize=" << thread_array[ti].error_info << std::endl; std::cout << " > SEPLVL: " << error_sep_lvl << std::endl; } @@ -283,9 +283,9 @@ namespace BaskerNS //If on diagonal, want to compare L and U Int resize_L = BASKER_MAX_IDX; Int resize_U = BASKER_MAX_IDX; - if(thread_array(ti).error_subblk <= -1) + if(thread_array[ti].error_subblk <= -1) { - resize_L = thread_array(ti).error_info; + resize_L = thread_array[ti].error_info; if(Options.verbose == BASKER_TRUE) { std::cout << " ++ L size: " << resize_L << std::endl; @@ -293,9 +293,9 @@ namespace BaskerNS } //We don't care about the other way since, //L is already checked before U. 
- if(thread_array(ti).error_subblk > -1) + if(thread_array[ti].error_subblk > -1) { - resize_U = thread_array(ti).error_info; + resize_U = thread_array[ti].error_info; if(Options.verbose == BASKER_TRUE) { std::cout << " ++ U size: " << resize_U << std::endl; @@ -305,9 +305,9 @@ namespace BaskerNS //Resize L, if resize_L != -1 (meaning realloc-L is requested) if(resize_L != BASKER_MAX_IDX) { - const Int tsb = (-1*thread_array(ti).error_subblk)-1; + const Int tsb = (-1*thread_array[ti].error_subblk)-1; BASKER_MATRIX &L = - LL(thread_array(ti).error_blk)(tsb); + LL[thread_array[ti].error_blk][tsb]; REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -322,9 +322,9 @@ namespace BaskerNS //Resize U, if resize_U != -1 (meaning realloc-U is requested) if(resize_U != BASKER_MAX_IDX) { - const Int tsb = thread_array(ti).error_subblk; + const Int tsb = thread_array[ti].error_subblk; BASKER_MATRIX &U = - LU(thread_array(ti).error_blk)(tsb); + LU[thread_array[ti].error_blk][tsb]; REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -346,13 +346,13 @@ namespace BaskerNS //Though this could be done in parallel in the future for(Int p = 0; p < num_threads; p++) { - Int blk = S(0)(p); + Int blk = S[0][p]; //if(LL(blk)(0).w_fill == BASKER_TRUE) { //Clear workspace, whole column for(Int sb = 0; sb < LL_size(blk); sb++) { - BASKER_MATRIX &SL = LL(blk)(sb); + BASKER_MATRIX &SL = LL[blk][sb]; for(Int i = 0; i < SL.iws_size*SL.iws_mult; ++i) { SL.iws(i) = (Int) 0; @@ -369,10 +369,10 @@ namespace BaskerNS Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A for(Int p = 0; p < num_threads; p++) { - Int blk = S(error_sep_lvl)(p); + Int blk = S[error_sep_lvl][p]; //if(LL(blk)(0).w_fill == BASKER_TRUE) { - BASKER_MATRIX &TM = LL(blk)(0); + BASKER_MATRIX &TM = LL[blk][0]; //printf( " > p=%d: scol_top = %d, scol = %d, ncol = %d\n",p,scol_top,TM.scol,TM.ncol ); for(Int i = scol_top + TM.scol; i < scol_top + (TM.scol+TM.ncol); i++) { @@ -386,7 +386,7 @@ namespace BaskerNS 
//Note, will have to clear the perm in all sep blk in that level //Clear permuation BASKER_MATRIX &SL = - LL(thread_array(ti).error_blk)(0); + LL[thread_array[ti].error_blk][0]; //printf( " + scol_top = %d, srow = %d, nrowl = %d\n",scol_top,SL.srow,SL.nrow ); for(Int i = scol_top + SL.srow; i < scol_top + (SL.srow+SL.nrow); i++) { @@ -394,12 +394,12 @@ namespace BaskerNS gperm(i) = BASKER_MAX_IDX; }//for--to clear perm - thread_start(ti) = thread_array(ti).error_blk; + thread_start(ti) = thread_array[ti].error_blk; //Reset - thread_array(ti).error_type = BASKER_ERROR_NOERROR; - thread_array(ti).error_blk = BASKER_MAX_IDX; - thread_array(ti).error_info = BASKER_MAX_IDX; + thread_array[ti].error_type = BASKER_ERROR_NOERROR; + thread_array[ti].error_blk = BASKER_MAX_IDX; + thread_array[ti].error_info = BASKER_MAX_IDX; for(Int i = 0; i < num_threads; i++) { @@ -451,9 +451,9 @@ namespace BaskerNS Int btab = btf_tabs_offset; for(Int ti = 0; ti < num_threads; ti++) { - Int c = thread_array(ti).error_blk; + Int c = thread_array[ti].error_blk; //Note: jdb we can make this into a switch - if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) + if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) { if (c >= btab) { thread_start(ti) = BASKER_MAX_IDX; @@ -463,7 +463,7 @@ namespace BaskerNS continue; }//end if NOERROR - if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) + if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { @@ -474,7 +474,7 @@ namespace BaskerNS return BASKER_ERROR; }//end if SINGULAR - if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) + if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) { std::cout << "ERROR_THREADS: " << ti << " DIAGBLK NOMALLOC blk=" << c @@ -482,16 +482,16 @@ namespace BaskerNS return BASKER_ERROR; }//end if NOMALLOC - if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) + if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) { - Int liwork = 
thread_array(ti).iws_size*thread_array(ti).iws_mult; - Int lework = thread_array(ti).ews_size*thread_array(ti).ews_mult; + Int liwork = thread_array[ti].iws_size*thread_array[ti].iws_mult; + Int lework = thread_array[ti].ews_size*thread_array[ti].ews_mult; BASKER_ASSERT(c >= 0, "nfactor_diag_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREADS: " << ti << " DIAGBLK MALLOC blk=" << c - << " newsize=" << thread_array(ti).error_info + << " newsize=" << thread_array[ti].error_info << " for both L( " << c << " ) and U( " << c << " )" << std::endl; @@ -504,24 +504,24 @@ namespace BaskerNS for(Int i = 0; i < liwork; i++) { - thread_array(ti).iws(i) = (Int) 0; + thread_array[ti].iws(i) = (Int) 0; } for(Int i = 0; i < lework; i++) { - thread_array(ti).ews(i) = zero; + thread_array[ti].ews(i) = zero; } //Resize L - BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); + BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); L.clear_pend(); REALLOC_INT_1DARRAY(L.row_idx, L.nnz, - thread_array(ti).error_info); + thread_array[ti].error_info); REALLOC_ENTRY_1DARRAY(L.val, L.nnz, - thread_array(ti).error_info); - L.mnnz = thread_array(ti).error_info; - L.nnz = thread_array(ti).error_info; + thread_array[ti].error_info); + L.mnnz = thread_array[ti].error_info; + L.nnz = thread_array[ti].error_info; for(Int i = 0; i < L.ncol; i++) { L.col_ptr(i) = 0; @@ -533,15 +533,15 @@ namespace BaskerNS } //Resize U - BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); + BASKER_MATRIX &U = (c >= btab ? 
UBTF[c-btab] : U_D[c]); REALLOC_INT_1DARRAY(U.row_idx, U.nnz, - thread_array(ti).error_info); + thread_array[ti].error_info); REALLOC_ENTRY_1DARRAY(U.val, U.nnz, - thread_array(ti).error_info); - U.mnnz = thread_array(ti).error_info; - U.nnz = thread_array(ti).error_info; + thread_array[ti].error_info); + U.mnnz = thread_array[ti].error_info; + U.nnz = thread_array[ti].error_info; for(Int i = 0; i < U.ncol; i++) { U.col_ptr(i) = 0; @@ -561,9 +561,9 @@ namespace BaskerNS } //Reset - thread_array(ti).error_type = BASKER_ERROR_NOERROR; - thread_array(ti).error_blk = BASKER_MAX_IDX; - thread_array(ti).error_info = BASKER_MAX_IDX; + thread_array[ti].error_type = BASKER_ERROR_NOERROR; + thread_array[ti].error_blk = BASKER_MAX_IDX; + thread_array[ti].error_info = BASKER_MAX_IDX; nthread_remalloc++; @@ -593,7 +593,7 @@ namespace BaskerNS { for(Int ti = 0; ti < num_threads; ti++) { - thread_array(ti).error_type = BASKER_ERROR_NOERROR; + thread_array[ti].error_type = BASKER_ERROR_NOERROR; } } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_matrix_decl.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_matrix_decl.hpp index 02a896d957c0..4bbd86507d9d 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_matrix_decl.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_matrix_decl.hpp @@ -95,6 +95,9 @@ namespace BaskerNS BASKER_INLINE int fill(); + BASKER_INLINE + void init_ptr(); + BASKER_INLINE void init_inc_lvl(); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_matrix_def.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_matrix_def.hpp index 4f12887c87ed..e40361e6f988 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_matrix_def.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_matrix_def.hpp @@ -328,7 +328,7 @@ namespace BaskerNS if(nnz == _nnz) { copy_vec(_row_idx, _nnz, row_idx); - copy_vec(_val,_nnz, val); + copy_vec(_val, _nnz, val); } else { @@ -498,6 +498,13 @@ namespace BaskerNS return 0; } + 
template + BASKER_INLINE + void BaskerMatrix::init_ptr() + { + for (Int i = 0; i < ncol+1; i ++) col_ptr(i) = 0; + } + template BASKER_INLINE void BaskerMatrix::convert2D diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp index 499e00edd417..6613d992dbc2 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp @@ -150,14 +150,14 @@ namespace BaskerNS const Mag normA_blk = BTF_A.anorm; Int b = S[0][kid]; //Which blk from schedule - BASKER_MATRIX &L = LL(b)(0); - BASKER_MATRIX &U = LU(b)(LU_size(b)-1); - BASKER_MATRIX &M = ALM(b)(0); //A->blk + BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &U = LU[b][LU_size(b)-1]; + BASKER_MATRIX &M = ALM[b][0]; //A->blk #ifdef BASKER_2DL //printf("Accessing blk: %d kid: %d \n", b, kid); - INT_1DARRAY ws = LL(b)(0).iws; - ENTRY_1DARRAY X = LL(b)(0).ews; - Int ws_size = LL(b)(0).iws_size; + INT_1DARRAY ws = LL[b][0].iws; + ENTRY_1DARRAY X = LL[b][0].ews; + Int ws_size = LL[b][0].iws_size; #else //else if BASKER_2DL INT_1DARRAY ws = thread_array[kid].iws; ENTRY_1DARRAY X = thread_array[kid].ews; @@ -577,11 +577,11 @@ namespace BaskerNS } } if (!explicit_pivot) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_SINGULAR; - thread_array(kid).error_blk = b; - thread_array(kid).error_subblk = 0; - thread_array(kid).error_info = k; + thread_array[kid].error_blk = b; + thread_array[kid].error_subblk = 0; + thread_array[kid].error_info = k; return BASKER_ERROR; } } @@ -676,17 +676,17 @@ namespace BaskerNS (int)kid, (long)b, (long)llnnz, (long)lnnz, (long)lcnt, (int)lnnz, (int)M.nrow, (long)newsize); } - thread_array(kid).error_blk = b; - thread_array(kid).error_subblk = 0; + thread_array[kid].error_blk = b; + thread_array[kid].error_subblk = 0; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = 
BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } @@ -701,17 +701,17 @@ namespace BaskerNS (int)kid, (long)b, (long)uunnz, (long)unnz+ucnt, (long)k, (int)uunnz, (int)M.nrow, (int)newsize); } - thread_array(kid).error_blk = b; - thread_array(kid).error_subblk = -1; + thread_array[kid].error_blk = b; + thread_array[kid].error_subblk = -1; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } @@ -981,10 +981,10 @@ namespace BaskerNS ) { //Setup variables - const Int wsb = S(0)(kid); + const Int wsb = S[0][kid]; - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1011,18 +1011,18 @@ namespace BaskerNS ) { const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A - const Int b = S(lvl)(kid); + const Int b = S[lvl][kid]; //const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); - const Int U_col = S(lvl)(kid); + BASKER_MATRIX &L = LL[b][0]; + const Int U_col = S[lvl][kid]; Int U_row = LU_size(U_col)-1; if(lvl > 0) { //U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); } - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //const Int brow = L.srow; @@ -1128,14 +1128,14 @@ namespace BaskerNS { //Setup variables - const Int b = S(lvl)(kid); - const Int 
wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_g = L.srow + scol_top; // global offset - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; //Int *color = &(ws[0]); Int *pattern = &(ws(ws_size)); @@ -1279,12 +1279,12 @@ namespace BaskerNS { //Setup variables - const Int b = S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; #ifdef BASKER_2DL - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; #else INT_1DARRAY ws = thread_array[kid].iws; Int ws_size = thread_array[kid].iws_size; @@ -1452,13 +1452,13 @@ namespace BaskerNS Int k, Int top, Int xnnz) { - const Int b = S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; #ifdef BASKER_2DL - INT_1DARRAY ws = LL(wsb)(l).iws; - ENTRY_1DARRAY X = LL(wsb)(l).ews; - Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + ENTRY_1DARRAY X = LL[wsb][l].ews; + Int ws_size = LL[wsb][l].iws_size; #else INT_1DARRAY ws = thread_array[kid].iws; ENTRY_1DARRAY X = thread_array[kid].ews; @@ -1534,10 +1534,10 @@ namespace BaskerNS Int X_col, Int X_row, Int k, Entry pivot) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; //const Int ws_size = LL(X_col)(X_row).iws_size; //const Int p_size = LL(X_col)(X_row).p_size; @@ 
-1608,7 +1608,7 @@ namespace BaskerNS #endif //LL[X_col][X_row].p_size = 0; - LL(X_col)(X_row).p_size = 0; + LL[X_col][X_row].p_size = 0; return 0; }//end t_dense_offdiag_mov_L() @@ -1623,12 +1623,12 @@ namespace BaskerNS Int X_col, Int X_row, Int k, Entry pivot) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - const Int ws_size = LL(X_col)(X_row).iws_size; - const Int p_size = LL(X_col)(X_row).p_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + const Int ws_size = LL[X_col][X_row].iws_size; + const Int p_size = LL[X_col][X_row].p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK @@ -1658,17 +1658,17 @@ namespace BaskerNS (long)blkcol, (long)blkrow, (long)kid, (long)llnnz, (long)lnnz, (long)p_size ); } - thread_array(kid).error_blk = blkcol; - thread_array(kid).error_subblk = blkrow; + thread_array[kid].error_blk = blkcol; + thread_array[kid].error_subblk = blkrow; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } //BASKER_ASSERT(0==1, "REALLOC LOWER BLOCK\n"); @@ -1714,7 +1714,7 @@ namespace BaskerNS } #endif - LL(X_col)(X_row).p_size = 0; + LL[X_col][X_row].p_size = 0; return 0; }//end t_offdiag_mov_L() @@ -1733,17 +1733,17 @@ namespace BaskerNS BASKER_BOOL A_option) { //Note: need to add support for offdiag permuation - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + 
INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; //printf( " t_dense_back_solve_offdiag( LL(%d,%d) and ALM(%d,%d)\n", blkcol,blkrow,blkcol,blkrow ); #ifdef BASKER_DEBUG_NFACTOR_BLK - Int ws_size = LL(X_col)(X_row).iws_size; + Int ws_size = LL[X_col][X_row].iws_size; const Int brow = L.srow; const Int bcol = L.scol; printf("\n\n"); @@ -1832,7 +1832,7 @@ namespace BaskerNS #ifdef BASKER_2DL //LL[X_col][X_row].p_size = nnz; - LL(X_col)(X_row).p_size = nnz; + LL[X_col][X_row].p_size = nnz; #endif //Debug @@ -1878,14 +1878,14 @@ namespace BaskerNS { //Note: need to add support for offdiag permuation - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL(X_col)(X_row).iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int ws_size = LL[X_col][X_row].iws_size; + Int nnz = LL[X_col][X_row].p_size; //const Int brow = L.srow; //const Int bcol = L.scol; @@ -2057,7 +2057,7 @@ namespace BaskerNS printf("kid %d Ending nnz: %d \n",kid, nnz); #endif //LL[X_col][X_row].p_size = nnz; - LL(X_col)(X_row).p_size = nnz; + LL[X_col][X_row].p_size = nnz; #endif //Debug diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp index 1fb5dc3fcc2b..48dae30f95c9 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp @@ -130,14 +130,14 @@ namespace BaskerNS BASKER_INLINE int Basker::t_nfactor_blk_inc_lvl(Int kid) { - Int b = S(0)(kid); //Which blk from schedule - BASKER_MATRIX &L = LL(b)(0); - BASKER_MATRIX 
&U = LU(b)(LU_size(b)-1); - BASKER_MATRIX &M = ALM(b)(0); //A->blk + Int b = S[0][kid]; //Which blk from schedule + BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &U = LU[b][LU_size(b)-1]; + BASKER_MATRIX &M = ALM[b][0]; //A->blk - INT_1DARRAY ws = LL(b)(0).iws; - ENTRY_1DARRAY X = LL(b)(0).ews; - Int ws_size = LL(b)(0).iws_size; + INT_1DARRAY ws = LL[b][0].iws; + ENTRY_1DARRAY X = LL[b][0].ews; + Int ws_size = LL[b][0].iws_size; Int brow = L.srow; //begining row Int lval = 0; @@ -384,10 +384,10 @@ namespace BaskerNS << pivot << endl; cout << "lcnt: " << lcnt << endl; } - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_SINGULAR; - thread_array(kid).error_blk = b; - thread_array(kid).error_info = k; + thread_array[kid].error_blk = b; + thread_array[kid].error_info = k; return BASKER_ERROR; } @@ -410,17 +410,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = b; - thread_array(kid).error_subblk = 0; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = b; + thread_array[kid].error_subblk = 0; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } @@ -441,17 +441,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = b; - thread_array(kid).error_subblk = -1; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = b; + thread_array[kid].error_subblk = -1; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } @@ -665,13 +665,13 @@ namespace BaskerNS { //Setup variables - const Int b = 
S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; const Int brow = L.srow; - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; //Int *color = &(ws[0]); Int *pattern = &(ws(ws_size)); @@ -936,12 +936,12 @@ namespace BaskerNS ) { //Setup variables - const Int b = S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -985,13 +985,13 @@ namespace BaskerNS //Will want to make this backward in the future //Setup variables - const Int b = S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; const Int brow = L.srow; - INT_1DARRAY ws = LL(wsb)(l).iws; - const Int ws_size = LL(wsb)(l).iws_size; + INT_1DARRAY ws = LL[wsb][l].iws; + const Int ws_size = LL[wsb][l].iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1353,12 +1353,12 @@ namespace BaskerNS //We note that this can be fixed to be faster - const Int b = S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); - INT_1DARRAY ws = LL(wsb)(l).iws; - ENTRY_1DARRAY X = LL(wsb)(l).ews; - const Int ws_size = LL(wsb)(l).iws_size; + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; + INT_1DARRAY ws = LL[wsb][l].iws; + ENTRY_1DARRAY X = LL[wsb][l].ews; + const Int ws_size = LL[wsb][l].iws_size; Int brow = L.srow; @@ -1441,12 +1441,12 @@ namespace BaskerNS { //We note that this can be fixed to be faster - const Int b = 
S(lvl)(kid); - const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL(b)(0); - INT_1DARRAY ws = LL(wsb)(l).iws; - ENTRY_1DARRAY X = LL(wsb)(l).ews; - const Int ws_size = LL(wsb)(l).iws_size; + const Int b = S[lvl][kid]; + const Int wsb = S[0][kid]; + BASKER_MATRIX &L = LL[b][0]; + INT_1DARRAY ws = LL[wsb][l].iws; + ENTRY_1DARRAY X = LL[wsb][l].ews; + const Int ws_size = LL[wsb][l].iws_size; Int brow = L.srow; Int *color = &(ws(0)); @@ -1555,14 +1555,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK printf("t_back_solve_diag, kid: %d blkcol: %d blkrow: %d \n", @@ -1696,7 +1696,7 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL(X_col)(X_row).p_size = nnz; + LL[X_col][X_row].p_size = nnz; #endif return; @@ -1717,14 +1717,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; //Int brow = L.srow; //Int bcol = L.scol; @@ -1869,14 +1869,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - 
BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; Int brow = L.srow; Int bcol = L.scol; @@ -2065,12 +2065,12 @@ namespace BaskerNS Int k, Entry pivot ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - const Int ws_size = LL(X_col)(X_row).iws_size; - const Int p_size = LL(X_col)(X_row).p_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + const Int ws_size = LL[X_col][X_row].iws_size; + const Int p_size = LL[X_col][X_row].p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK @@ -2105,18 +2105,18 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = blkcol; - thread_array(kid).error_subblk = blkrow; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = blkcol; + thread_array[kid].error_subblk = blkrow; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } @@ -2155,14 +2155,14 @@ namespace BaskerNS //Fix later if(Options.same_pattern == BASKER_FALSE) { - for(Int i = 0; i < LL(X_col)(X_row).nrow; i++) + for(Int i = 0; i < LL[X_col][X_row].nrow; i++) { stack[i] = BASKER_MAX_IDX; } } L.col_ptr(k+1) = lnnz; - LL(X_col)(X_row).p_size = 0; + LL[X_col][X_row].p_size = 0; return 0; }//end t_offdiag_mov_L_inc_lvl() @@ -2729,8 +2729,8 @@ namespace BaskerNS BASKER_BOOL 
A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; /* @@ -2740,27 +2740,27 @@ namespace BaskerNS LP_col, LP_row, kid); */ - BASKER_MATRIX *UPP = &LU(UP_col)(0); + BASKER_MATRIX *UPP = &LU[UP_col][0]; if(UP_row != BASKER_MAX_IDX) { - UPP = &(LU(UP_col)(UP_row)); + UPP = &(LU[UP_col][UP_row]); } BASKER_MATRIX &UP = *(UPP); - BASKER_MATRIX *LPP = &LU(LP_col)(0); + BASKER_MATRIX *LPP = &LU[LP_col][0]; if(LP_row != BASKER_MAX_IDX) { - LPP = &(LL(LP_col)(LP_row)); + LPP = &(LL[LP_col][LP_row]); } BASKER_MATRIX &LP = *(LPP); - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; @@ -2948,7 +2948,7 @@ namespace BaskerNS }//over all nonzero in left - LL(X_col)(X_row).p_size = nnz; + LL[X_col][X_row].p_size = nnz; return; @@ -2969,14 +2969,14 @@ namespace BaskerNS Int x_size, Int x_offset, BASKER_BOOL A_option) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL(X_col)(X_row).p_size; + Int nnz = LL[X_col][X_row].p_size; //const Int brow = L.srow; //const Int bcol = L.scol; @@ -3106,7 +3106,7 @@ namespace BaskerNS */ - Int temp = INC_LVL_TEMP(k_i+LL(blkcol)(0).srow) + L.inc_lvl(j) + 1; + Int temp = INC_LVL_TEMP(k_i+LL[blkcol][0].srow) + L.inc_lvl(j) + 1; /* printf("lower row: %d kid: %d inc: %d %d %d j: %d 
\n", @@ -3183,7 +3183,7 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL(X_col)(X_row).p_size = nnz; + LL[X_col][X_row].p_size = nnz; #endif //Debug @@ -3219,11 +3219,11 @@ namespace BaskerNS Int k, Entry pivot ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - const Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + const Int ws_size = LL[X_col][X_row].iws_size; //const Int p_size = LL(X_col)(X_row).p_size; //NDE - warning: unused @@ -3296,7 +3296,7 @@ namespace BaskerNS } L.col_ptr(k+1) = lnnz; - LL(X_col)(X_row).p_size = 0; + LL[X_col][X_row].p_size = 0; return 0; }//end t_dense_offdiag_mov_L_inv_lvl() @@ -3315,12 +3315,12 @@ namespace BaskerNS const BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &B = ALM[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; //Int nnz = LL(X_col)(X_row).p_size; //Int brow = L.srow; @@ -3439,11 +3439,11 @@ namespace BaskerNS Int x_size, Int x_offset ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; //Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //Not used @@ -3576,11 +3576,11 @@ namespace BaskerNS Int x_size, Int x_offset ) { - BASKER_MATRIX &L = 
LL(blkcol)(blkrow); + BASKER_MATRIX &L = LL[blkcol][blkrow]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; + Int ws_size = LL[X_col][X_row].iws_size; //Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //Not used @@ -3758,16 +3758,16 @@ namespace BaskerNS BASKER_MATRIX *B; if(lower == BASKER_TRUE) { - B = &(ALM(blkcol)(blkrow)); + B = &(ALM[blkcol][blkrow]); } else { - B = &(AVM(blkcol)(blkrow)); + B = &(AVM[blkcol][blkrow]); } BASKER_MATRIX &M = *B; //BASKER_MATRIX &M = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL(X_col)(X_row).iws; - const Int ws_size = LL(X_col)(X_row).iws_size; + INT_1DARRAY ws = LL[X_col][X_row].iws; + const Int ws_size = LL[X_col][X_row].iws_size; Int *color = &(ws(0)); Int *pattern = &(color[ws_size]); @@ -3840,9 +3840,9 @@ namespace BaskerNS ) { - const Int my_idx = S(0)(kid); + const Int my_idx = S[0][kid]; const Int team_leader = find_leader(kid,sl); - const Int leader_idx = S(0)(team_leader); + const Int leader_idx = S[0][team_leader]; //Int loop_col_idx = S(l)(kid); //printf("Reduce col fill called, kid: %d leader: %d \n", @@ -3857,12 +3857,12 @@ namespace BaskerNS for(Int blk = l+1; blk < endblk; ++blk) { // ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; //NDE - warning: unused - INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; + INT_1DARRAY &wsL = LL[leader_idx][blk].iws; //Int p_sizeL = LL(leader_idx)(blk).p_size; - Int ws_sizeL = LL(leader_idx)(blk).iws_size; + Int ws_sizeL = LL[leader_idx][blk].iws_size; // ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; //NDE - warning: unused - INT_1DARRAY &ws = LL(my_idx)(blk).iws; - const Int ws_size = LL(my_idx)(blk).iws_size; + INT_1DARRAY &ws = LL[my_idx][blk].iws; + const Int ws_size = LL[my_idx][blk].iws_size; //Int p_size = LL(my_idx)(blk).p_size; Int *color = &(ws[0]); Int *pattern = &(color[ws_size]); @@ -3875,7 +3875,7 @@ 
namespace BaskerNS Int *stackL = &(patternL[ws_sizeL]); //over all nnnz found - for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) + for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) { //if(kid==3) // { @@ -3941,12 +3941,12 @@ namespace BaskerNS //printf("===========T ADD ORIG FILL CALLED\n"); const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S(lvl)(leader_id); + const Int L_col = S[lvl][leader_id]; Int L_row = 0; //const Int U_col = S(lvl)(leader_id); //Int U_row = LU_size(U_col)-1; //Int X_col = S(0)(leader_id); - Int X_col = S(0)(kid); + Int X_col = S[0][kid]; Int X_row = l+1; @@ -3977,7 +3977,7 @@ namespace BaskerNS //Int L_row = 0; //const Int U_col = S(lvl)(leader_id); //Int U_row = LU_size(U_col)-1; - Int X_col = S(0)(leader_id); + Int X_col = S[0][leader_id]; Int X_row = l+1; //printf("=***== fill MY ID: %d LEADER ID: %d ===** \n", @@ -3987,7 +3987,7 @@ namespace BaskerNS { Int bl = l+1; - Int A_col = S(lvl)(kid); + Int A_col = S[lvl][kid]; /* printf("leader_id: %d kid: %d lvl: %d l: %d blk: %d %d \n", @@ -3996,16 +3996,16 @@ namespace BaskerNS */ Int my_row_leader = find_leader(kid, lvl-1); Int my_new_row = - S(bl)(kid) - S(0)(my_row_leader); + S[bl][kid] - S[0][my_row_leader]; - Int A_row = (lvl==l)?(2):S(bl)(kid)%(LU_size(A_col)); - if((S(bl)(kid)>14) && - (S(bl)(kid)>LU_size(A_col)) && + Int A_row = (lvl==l)?(2):S[bl][kid]%(LU_size(A_col)); + if((S[bl](kid)>14) && + (S[bl](kid)>LU_size(A_col)) && (lvl != 1)) { - Int tm = (S(bl)(kid)+1)/16; - A_row = ((S(bl)(kid)+1)-(tm*16))%LU_size(A_col); + Int tm = (S[bl][kid]+1)/16; + A_row = ((S[bl][kid]+1)-(tm*16))%LU_size(A_col); } /* diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp index 650bc77a8de6..9c77c1f38994 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp @@ -134,12 
+134,12 @@ namespace BaskerNS double barrier_time = 0; #endif - Int U_col = S(lvl)(kid); + Int U_col = S[lvl][kid]; Int U_row = 0; - const Int scol = LU(U_col)(U_row).scol; - const Int ecol = LU(U_col)(U_row).ecol; - const Int ncol = LU(U_col)(U_row).ncol; + const Int scol = LU[U_col][U_row].scol; + const Int ecol = LU[U_col][U_row].ecol; + const Int ncol = LU[U_col][U_row].ncol; //for(Int k = scol; k < ecol; k++) //might have to use k+scol for barrier @@ -460,15 +460,15 @@ namespace BaskerNS const Entry zero (0.0); //Get needed variables - const Int L_col = S(l)(kid); - const Int U_col = S(lvl)(kid); + const Int L_col = S[l][kid]; + const Int U_col = S[lvl][kid]; - Int my_row_leader = S(0)(find_leader(kid,lvl-1)); + Int my_row_leader = S[0][find_leader(kid,lvl-1)]; //Int my_new_row = // L_col - my_row_leader; Int U_row = L_col - my_row_leader; - const Int X_col = S(0)(kid); + const Int X_col = S[0][kid]; const Int X_row = l; //X_row = lower(L) //const Int col_idx_offset = 0; //we might be able to remove @@ -480,7 +480,7 @@ namespace BaskerNS #endif //end get needed variables// - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //Ask C++ guru if this is ok BASKER_MATRIX *Bp; @@ -488,7 +488,7 @@ namespace BaskerNS //if(sep_flg == BASKER_FALSE) if(l == 0) { - Bp = &(AVM(U_col)(U_row)); + Bp = &(AVM[U_col][U_row]); //bbcol = Bp->scol; } else @@ -503,9 +503,9 @@ namespace BaskerNS // kid, X_col, X_row); - INT_1DARRAY ws = LL(X_col)(X_row).iws; - const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + INT_1DARRAY ws = LL[X_col][X_row].iws; + const Int ws_size = LL[X_col][X_row].iws_size; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_a = U.srow; // offset within A @@ -649,17 +649,17 @@ namespace BaskerNS Int newsize = (unnz+U.nrow) * 1.2 ; - thread_array(kid).error_blk = U_col; - thread_array(kid).error_subblk = 
U_row; + thread_array[kid].error_blk = U_col; + thread_array[kid].error_subblk = U_row; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; }//if/else realloc } @@ -741,10 +741,10 @@ namespace BaskerNS std::cout << "----Error--- kid = " << kid << ": extra L[" << j << "]=" << X[j] << " with gperm( " << brow_g << " + " << j << " ) = " << t << std::endl; - thread_array(kid).error_type = BASKER_ERROR_OTHER; - thread_array(kid).error_blk = lvl; - thread_array(kid).error_subblk = l; - thread_array(kid).error_info = k; + thread_array[kid].error_type = BASKER_ERROR_OTHER; + thread_array[kid].error_blk = lvl; + thread_array[kid].error_subblk = l; + thread_array[kid].error_info = k; info = BASKER_ERROR; //BASKER_ASSERT(t != BASKER_MAX_IDX, "lower entry in U"); #endif @@ -864,14 +864,14 @@ namespace BaskerNS int lteam_size = pow(2, l); #ifdef BASKER_2DL - Int L_col = S(l)(my_leader); + Int L_col = S[l][my_leader]; Int L_row = 0; - Int U_col = S(lvl)(kid); - Int U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); - Int X_col = S(0)(my_leader); + Int U_col = S[lvl][kid]; + Int U_row = (lvl==1)?(kid%2):S[l][kid]%LU_size(U_col); + Int X_col = S[0][my_leader]; Int X_row = l; //this will change for us Int col_idx_offset = 0; - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; const Int bcol = U.scol; #else BASKER_ASSERT(0==1, "t_upper_col_factor_offdiag, only work with with 2D layout"); @@ -1066,11 +1066,11 @@ namespace BaskerNS const Mag normA_blk = BTF_A.anorm; //Get needed variables - const Int L_col = S(lvl)(kid); + const Int L_col = S[lvl][kid]; const Int L_row = 0; - const Int U_col = S(lvl)(kid); + const Int 
U_col = S[lvl][kid]; const Int U_row = LU_size(U_col)-1; - const Int X_col = S(0)(kid); + const Int X_col = S[0][kid]; //Int col_idx_offset = 0; //can we get rid of now? #ifdef BASKER_DEBUG_NFACTOR_COL @@ -1080,10 +1080,10 @@ namespace BaskerNS #endif //end get needed variables - BASKER_MATRIX &L = LL(L_col)(L_row); - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &L = LL[L_col][L_row]; + BASKER_MATRIX &U = LU[U_col][U_row]; - BASKER_MATRIX &B = thread_array(kid).C; + BASKER_MATRIX &B = thread_array[kid].C; #ifdef BASKER_DEBUG_NFACTOR_COL if(kid >= 0) @@ -1098,9 +1098,9 @@ namespace BaskerNS //B.print(); - INT_1DARRAY ws = LL(X_col)(l+1).iws; - const Int ws_size = LL(X_col)(l+1).iws_size; - ENTRY_1DARRAY X = LL(X_col)(l+1).ews; + INT_1DARRAY ws = LL[X_col][l+1].iws; + const Int ws_size = LL[X_col][l+1].iws_size; + ENTRY_1DARRAY X = LL[X_col][l+1].ews; Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_a = U.srow; // offset within A @@ -1327,10 +1327,10 @@ namespace BaskerNS X(maxindex) = pivot; } else { // replace-tiny-pivot not requested, or the current column is structurally empty after elimination - thread_array(kid).error_type = BASKER_ERROR_SINGULAR; - thread_array(kid).error_blk = L_col; - thread_array(kid).error_subblk = -1; - thread_array(kid).error_info = k; + thread_array[kid].error_type = BASKER_ERROR_SINGULAR; + thread_array[kid].error_blk = L_col; + thread_array[kid].error_subblk = -1; + thread_array[kid].error_info = k; return BASKER_ERROR; } } else if (Options.replace_tiny_pivot && normA_blk > abs(zero) && abs(pivot) < normA_blk * sqrt(eps)) { @@ -1374,17 +1374,17 @@ namespace BaskerNS //cout << " > L_col = " << L_col << " L_row = " << L_row << endl; } - thread_array(kid).error_blk = L_col; - thread_array(kid).error_subblk = -1; + thread_array[kid].error_blk = L_col; + thread_array[kid].error_subblk = -1; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + 
thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -1399,17 +1399,17 @@ namespace BaskerNS << endl; } - thread_array(kid).error_blk = U_col; - thread_array(kid).error_subblk = U_row; + thread_array[kid].error_blk = U_col; + thread_array[kid].error_subblk = U_row; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -1640,20 +1640,20 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S(lvl)(leader_id); + const Int L_col = S[lvl][leader_id]; Int L_row = 0; - const Int U_col = S(lvl)(leader_id); + const Int U_col = S[lvl][leader_id]; Int U_row = LU_size(U_col)-1; - Int X_col = S(0)(leader_id); + Int X_col = S[0][leader_id]; Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? 
- BASKER_MATRIX &L = LL(L_col)(L_row); - BASKER_MATRIX &U = LU(U_col)(U_row); //U.fill(); + BASKER_MATRIX &L = LL[L_col][L_row]; + BASKER_MATRIX &U = LU[U_col][U_row]; //U.fill(); - INT_1DARRAY ws = LL(X_col)(X_row).iws; - const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + INT_1DARRAY ws = LL[X_col][X_row].iws; + const Int ws_size = LL[X_col][X_row].iws_size; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; const Int bcol = U.scol; @@ -1743,15 +1743,15 @@ namespace BaskerNS //Setup - Int A_col = S(lvl)(kid); - Int A_row = (lvl==1)?(2):S(l+1)(kid)%(LU_size(A_col)); + Int A_col = S[lvl][kid]; + Int A_row = (lvl==1)?(2):S[l+1][kid]%(LU_size(A_col)); - BASKER_MATRIX &B = AVM(A_col)(A_col); + BASKER_MATRIX &B = AVM[A_col][A_col]; - const Int my_idx = S(0)(kid); + const Int my_idx = S[0][kid]; team_leader = find_leader(kid, l); - const Int leader_idx = S(0)(team_leader); - Int loop_col_idx = S(l)(kid); + const Int leader_idx = S[0][team_leader]; + Int loop_col_idx = S[l][kid]; #ifdef BASKER_DEBUG_NFACTOR_COL printf("Called t_blk_col_copy_atomic kid: %d " , kid); @@ -1769,17 +1769,17 @@ namespace BaskerNS //Split over threads (leader and nonleader) for(Int blk=l+1; blk Accumulate the update from (l-1)th level: // LU(U_col)(U_row) -= L(U_col)(l-1) * U(l-1)(U_row) t_add_extend(thread, kid, lvl, l-1, k, - LU(U_col)(U_row).scol, + LU[U_col][U_row].scol, BASKER_FALSE); if(kid%((Int)pow(2, l)) == 0) @@ -248,9 +248,9 @@ namespace BaskerNS // printf("[3] barrier test, kid: %d leader: %d b_size: %d lvl: %d \n", // kid, my_leader, b_size, lvl); t_basker_barrier(thread, kid, my_leader, - b_size, 3, LU(U_col)(U_row).scol, 0); + b_size, 3, LU[U_col][U_row].scol, 0); for(Int ti = 0; ti < num_threads; ti++) { - if (thread_array(kid).error_type != BASKER_SUCCESS) { + if (thread_array[kid].error_type != BASKER_SUCCESS) { info = BASKER_ERROR; } } @@ -287,7 +287,7 @@ namespace BaskerNS printf( " kid=%d: calling t_add_extend(k=%d/%d)\n",kid,k,ncol ); 
fflush(stdout); #endif t_add_extend(thread, kid,lvl,lvl-1, k, - LU(U_col)(U_row).scol, + LU[U_col][U_row].scol, BASKER_TRUE); } #ifdef BASKER_TIMER @@ -336,7 +336,7 @@ namespace BaskerNS t_basker_barrier(thread, kid, my_leader, b_size, 4, k, lvl-1); for(Int tid = 0; tid < num_threads; tid++) { - if (thread_array(tid).error_type != BASKER_SUCCESS) { + if (thread_array[tid].error_type != BASKER_SUCCESS) { info = BASKER_ERROR; } } @@ -395,7 +395,7 @@ namespace BaskerNS #ifdef BASKER_TIMER double time_factot = timer.seconds(); if((kid%(Int)(pow(2,lvl))) == 0) { - const Int L_col = S(lvl)(kid); + const Int L_col = S[lvl][kid]; const Int L_row = LU_size(U_col)-1; printf("Time Lower-Col(%d): %lf, n = %d, nnz(L) = %d, nnz(U) = %d \n", (int)kid, time_factot, @@ -446,7 +446,7 @@ namespace BaskerNS #endif //This will do the correct spmv - if(thread_array(kid).error_type == BASKER_ERROR_NOERROR) { + if(thread_array[kid].error_type == BASKER_ERROR_NOERROR) { t_upper_col_factor_offdiag2(kid, lvl, sl,l, k, lower); } //Barrier--Start @@ -461,7 +461,7 @@ namespace BaskerNS //Barrier--End if(kid%((Int)pow(2,sl)) == 0 && - thread_array(kid).error_type == BASKER_ERROR_NOERROR) { + thread_array[kid].error_type == BASKER_ERROR_NOERROR) { t_dense_blk_col_copy_atomic2(kid, my_leader, lvl, sl, l, k, lower); } @@ -477,7 +477,7 @@ namespace BaskerNS #endif }//over all sublevels - if(thread_array(kid).error_type == BASKER_ERROR_NOERROR) { + if(thread_array[kid].error_type == BASKER_ERROR_NOERROR) { t_dense_copy_update_matrix2(kid, my_leader, lvl, l, k); } }//end t_add_add @@ -507,15 +507,15 @@ namespace BaskerNS return; } - Int my_row_leader = S(0)(find_leader(kid,lvl-1)); - const Int L_col = S(sl)(my_leader); - const Int U_col = S(lvl)(kid); - const Int X_col = S(0)(my_leader); + Int my_row_leader = S[0][find_leader(kid,lvl-1)]; + const Int L_col = S[sl][my_leader]; + const Int U_col = S[lvl][kid]; + const Int X_col = S[0][my_leader]; Int L_row = l-sl+1; //Might have to think about th Int 
U_row = L_col-my_row_leader; Int X_row = l+1; //this will change for us - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; #ifdef BASKER_DEBUG_NFACTOR_COL2 if(L_row >= LL_size(L_col)) { @@ -588,10 +588,10 @@ namespace BaskerNS //Setup //printf("DEBUG, kid: %d k: %d A_col: %d A_row: %d \n", // kid, k, A_col, A_row); - const Int my_idx = S(0)(kid); + const Int my_idx = S[0][kid]; //should remove either as a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S(0)(team_leader); + const Int leader_idx = S[0][team_leader]; #ifdef BASKER_DEBUG_NFACTOR_COL2 if(lower == BASKER_TRUE) { @@ -609,10 +609,10 @@ namespace BaskerNS Int endblk = (lower)?(LL_size(my_idx)):(l+2); for(Int blk = l+1; blk < endblk; ++blk) { - ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; - Int p_sizeL = LL(leader_idx)(blk).p_size; - ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; - INT_1DARRAY &ws = LL(my_idx)(blk).iws; + ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; + Int p_sizeL = LL[leader_idx][blk].p_size; + ENTRY_1DARRAY &X = LL[my_idx][blk].ews; + INT_1DARRAY &ws = LL[my_idx][blk].iws; Int *color = &(ws[0]); //printf( " + t_dense_blk_col_copy_atomic2(kid=%d: LL(%d)(%d) += LL(%d)(%d)\n",kid,leader_idx, blk,my_idx,blk); @@ -629,7 +629,7 @@ namespace BaskerNS #endif //over all nnnz found - for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) + for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) { color[jj] = 0; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -677,7 +677,7 @@ namespace BaskerNS //This can be removed in the future if(kid != team_leader) { - LL(my_idx)(blk).p_size = 0; + LL[my_idx][blk].p_size = 0; } else { @@ -685,7 +685,7 @@ namespace BaskerNS printf("SETTING PS: %d L:%d %d kid: %d\n", p_sizeL, leader_idx, blk, kid); #endif - LL(leader_idx)(blk).p_size = p_sizeL; + LL[leader_idx][blk].p_size = p_sizeL; //p_size = 0; //not needed }//over all blks } @@ -709,8 +709,8 @@ namespace BaskerNS //printf("\n\n\n\n"); const Entry zero (0.0); - const Int leader_idx = 
S(0)(kid); - BASKER_MATRIX &C = thread_array(kid).C; + const Int leader_idx = S[0][kid]; + BASKER_MATRIX &C = thread_array[kid].C; Int nnz = 0; //Over each blk @@ -724,10 +724,10 @@ namespace BaskerNS // X += B(:, k) { Int bl = l+1; - Int A_col = S(lvl)(kid); + Int A_col = S[lvl][kid]; - Int my_row_leader = S(0)(find_leader(kid,lvl-1)); - Int A_row = S(bl)(kid) - my_row_leader; + Int my_row_leader = S[0][find_leader(kid,lvl-1)]; + Int A_row = S[bl][kid] - my_row_leader; BASKER_MATRIX *Bp; if(A_row != (LU_size(A_col)-1)) @@ -735,12 +735,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM(A_col)(A_row)); + Bp = &(AVM[A_col][A_row]); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM(A_col)(0)); + Bp = &(ALM[A_col][0]); } #ifdef BASKER_DEBUG_NFACTOR_COL2 printf("copy, kid: %d bl: %d A: %d %d \n", @@ -749,7 +749,7 @@ namespace BaskerNS // X += B(:, k) BASKER_MATRIX &B = *Bp; - ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; + ENTRY_1DARRAY X = LL[leader_idx][bl].ews; //printf( " -- t_dense_copy_update_matrix2(kid=%d: LL(%d)(%d) += B)\n",kid,leader_idx,bl ); //printf("ADDING UPDATES TO B\n"); //B.info(); @@ -800,9 +800,9 @@ namespace BaskerNS //For recounting patterns in dense blk //Need better sparse update - ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; - INT_1DARRAY ws = LL(leader_idx)(bl).iws; - const Int nrow = LL(leader_idx)(bl).nrow; + ENTRY_1DARRAY X = LL[leader_idx][bl].ews; + INT_1DARRAY ws = LL[leader_idx][bl].iws; + const Int nrow = LL[leader_idx][bl].nrow; Int *color = &(ws(0)); #ifdef BASKER_DEBUG_NFACTOR_COL2 printf("moving, kid: %d A: %d %d %d %d p_size: %d \n", @@ -875,18 +875,18 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S(lvl)(leader_id); - const Int U_col = S(lvl)(leader_id); + const Int L_col = S[lvl][leader_id]; + const Int U_col = S[lvl][leader_id]; Int L_row = 0; Int U_row = 
LU_size(U_col)-1; - Int X_col = S(0)(leader_id); + Int X_col = S[0][leader_id]; Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; pivot = U.tpivot; //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp index 1425385d9f2e..ee72c5d32c7b 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp @@ -83,7 +83,7 @@ namespace BaskerNS ) { - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; Int U_row = 0; //const Int scol = LU(U_col)(U_row).scol; @@ -101,7 +101,7 @@ namespace BaskerNS //for(Int k = 0; k < 1; ++k) - for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) + for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -148,7 +148,7 @@ namespace BaskerNS //barrier k = 0 usedl1 t_basker_barrier_inc_lvl(thread,kid,my_leader, - b_size, 0, LU(U_col)(U_row).scol, 0); + b_size, 0, LU[U_col][U_row].scol, 0); //printf("1 kid: %d error_leader: %d lvl: %d \n", kid, error_leader, lvl); BASKER_BOOL error_flag = BASKER_FALSE; basker_barrier.ExitGet(error_leader, error_flag); @@ -172,7 +172,7 @@ namespace BaskerNS { //for(Int k = 2; k < 3; ++k) - for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) + for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -181,7 +181,7 @@ namespace BaskerNS #endif t_add_extend_inc_lvl(thread, kid,lvl,l-1, k, - LU(U_col)(U_row).scol, + LU[U_col][U_row].scol, BASKER_FALSE); //where to start again @@ -234,7 +234,7 @@ namespace BaskerNS // printf("[3] barrier test, kid: %d leader: %d b_size: %d lvl: %d \n", // kid, my_leader, b_size, lvl); t_basker_barrier_inc_lvl(thread, kid, my_leader, - b_size, 7, LU(U_col)(U_row).scol, 0); + b_size, 7, 
LU[U_col][U_row].scol, 0); #ifdef BASKER_DEBUG_NFACTOR_COL_INC if(kid == 0) @@ -248,7 +248,7 @@ namespace BaskerNS //if(lvl < 2) { //for(Int k=0; k < 1; ++k) - for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) + for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -259,7 +259,7 @@ namespace BaskerNS //printf("test: %d \n", LU(U_col)(U_row).scol); t_add_extend_inc_lvl(thread, kid,lvl,lvl-1, k, - LU(U_col)(U_row).scol, + LU[U_col][U_row].scol, BASKER_TRUE); Entry pivot = 0; if((kid%(Int)(pow(2,lvl))) == 0) @@ -577,12 +577,12 @@ namespace BaskerNS ) { l = l+1; - Int my_token = S(l)(kid); + Int my_token = S[l][kid]; Int my_loc = kid; while((my_loc > 0)) { my_loc--; - if(S(l)(my_loc) != my_token) + if(S[l][my_loc] != my_token) { my_loc++; break; @@ -615,14 +615,14 @@ namespace BaskerNS //Get needed variables - const Int L_col = S(l)(kid); + const Int L_col = S[l][kid]; // const Int L_row = 0; //NDE - warning: unused - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); //Int my_new_row = // L_col - S(0)(my_row_leader); - Int U_row = L_col - S(0)(my_row_leader); + Int U_row = L_col - S[0][my_row_leader]; /* Int U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); @@ -642,7 +642,7 @@ namespace BaskerNS //U_row = my_new_row; - const Int X_col = S(0)(kid); + const Int X_col = S[0][kid]; const Int X_row = l; //X_row = lower(L) //const Int col_idx_offset = 0; //we might be able to remove @@ -654,13 +654,13 @@ namespace BaskerNS //end get needed variables// //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //Ask C++ guru if this is ok BASKER_MATRIX *Bp; if(l == 0) { - Bp = &(AVM(U_col)(U_row)); + Bp = &(AVM[U_col][U_row]); } else { @@ -674,9 +674,9 @@ namespace BaskerNS // } //B.print(); - INT_1DARRAY ws = LL(X_col)(X_row).iws; - const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = 
LL(X_col)(X_row).ews; + INT_1DARRAY ws = LL[X_col][X_row].iws; + const Int ws_size = LL[X_col][X_row].iws_size; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; const Int brow = U.srow; //const Int bcol = U.scol; @@ -920,18 +920,18 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { //printf("HERE\n"); - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = U_col; - thread_array(kid).error_subblk = U_row; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = U_col; + thread_array[kid].error_subblk = U_row; + thread_array[kid].error_info = newsize; return BASKER_ERROR; }//if/else realloc }//if need to realloc @@ -1086,26 +1086,26 @@ namespace BaskerNS // kid, lvl, sl, l); } - const Int L_col = S(sl)(my_leader); + const Int L_col = S[sl][my_leader]; Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S(0)(my_row_leader); + L_col - S[0][my_row_leader]; // Int U_row = my_new_row; Int U_row = - (lvl==1)?(kid%2):S(sl)(kid)%LU_size(U_col); - if((S(sl)(kid) > 14) && - (S(sl)(kid) > LU_size(U_col)) && + (lvl==1)?(kid%2):S[sl][kid]%LU_size(U_col); + if((S[sl][kid] > 14) && + (S[sl][kid] > LU_size(U_col)) && (lvl != 1)) { //printf("lower offdiag new num, %d %d \n", // S(sl)(kid), LU_size(U_col)); - Int tm = (S(sl)(kid)+1)/16; - U_row = ((S(sl)(kid)+1) - (tm*16))%LU_size(U_col); + Int tm = (S[sl][kid]+1)/16; + U_row = ((S[sl][kid]+1) - (tm*16))%LU_size(U_col); } //printf("UFF kid:%d U: %d %d new: %d leader: %d %d lvl: %d l: %d sl: %d \n", @@ -1116,12 +1116,12 @@ namespace BaskerNS //JDB PASS TEST U_row = my_new_row; - const Int X_col = S(0)(my_leader); + const Int X_col = S[0][my_leader]; Int X_row = l+1; //this will change for us //Int 
col_idx_offset = 0; - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //const Int bcol = U.scol; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -1256,31 +1256,31 @@ namespace BaskerNS return; } - const Int L_col = S(sl)(my_leader); + const Int L_col = S[sl][my_leader]; Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S(0)(my_row_leader); + L_col - S[0][my_row_leader]; Int U_row = 0; U_row = my_new_row; - const Int X_col = S(0)(my_leader); + const Int X_col = S[0][my_leader]; Int X_row = l+1; //this will change for us Int col_idx_offset = 0; - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //Need to give them the output pattern - Int U_pattern_col = S(lvl)(kid); + Int U_pattern_col = S[lvl][kid]; Int my_pattern_leader = find_leader_inc_lvl(kid,l); - Int U_pattern_row = S(l+1)(my_pattern_leader) - - S(0)(my_row_leader); + Int U_pattern_row = S[l+1][my_pattern_leader] - + S[0][my_row_leader]; /* printf("Test mypleader: %d myrowleader: %d kid: %d\n", @@ -1292,7 +1292,7 @@ namespace BaskerNS */ - Int L_pattern_col = S(lvl)(kid); + Int L_pattern_col = S[lvl][kid]; Int L_pattern_row = BASKER_MAX_IDX; if(lower == BASKER_TRUE) { @@ -1418,26 +1418,26 @@ namespace BaskerNS return; } - const Int L_col = S(sl)(my_leader); + const Int L_col = S[sl][my_leader]; Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S(0)(my_row_leader); + L_col - S[0][my_row_leader]; // Int U_row = my_new_row; Int U_row = - (lvl==1)?(kid%2):S(sl)(kid)%LU_size(U_col); - if((S(sl)(kid) > 14) && - (S(sl)(kid) > LU_size(U_col)) && + (lvl==1)?(kid%2):S[sl][kid]%LU_size(U_col); + if((S[sl][kid] > 14) && + (S[sl][kid] > LU_size(U_col)) && (lvl != 1)) { - Int tm = (S(sl)(kid)+1)/16; - U_row = 
((S(sl)(kid)+1) - (tm*16))%LU_size(U_col); + Int tm = (S[sl][kid]+1)/16; + U_row = ((S[sl][kid]+1) - (tm*16))%LU_size(U_col); } // printf("lowerspmv kid: %d U: %d %d new %d leader: %d %d lvl: %d %d %d \n", @@ -1448,12 +1448,12 @@ namespace BaskerNS U_row = my_new_row; - const Int X_col = S(0)(my_leader); + const Int X_col = S[0][my_leader]; Int X_row = l+1; //this will change for us Int col_idx_offset = 0; - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; //const Int bcol = U.scol; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -1538,8 +1538,8 @@ namespace BaskerNS ) { - const Int leader_idx = S(0)(kid); - BASKER_MATRIX &C = thread_array(kid).C; + const Int leader_idx = S[0][kid]; + BASKER_MATRIX &C = thread_array[kid].C; Int nnz = 0; // Int gbrow = 0; //NDE - warning: unused @@ -1549,11 +1549,11 @@ namespace BaskerNS { //Copy B -> C Int bl = l+1; - Int A_col = S(lvl)(kid); + Int A_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - S(bl)(kid) - S(0)(my_row_leader); + S[bl][kid] - S[0][my_row_leader]; Int A_row = 0; A_row = my_new_row; @@ -1564,12 +1564,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM(A_col)(A_row)); + Bp = &(AVM[A_col][A_row]); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM(A_col)(0)); + Bp = &(ALM[A_col][0]); } BASKER_MATRIX &B = *Bp; //printf("ADDING UPDATES TO B\n"); @@ -1580,10 +1580,10 @@ namespace BaskerNS //return; //Int team_leader = find_leader(kid, l); //Not used - ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; - INT_1DARRAY ws = LL(leader_idx)(bl).iws; + ENTRY_1DARRAY X = LL[leader_idx][bl].ews; + INT_1DARRAY ws = LL[leader_idx][bl].iws; Int *color = &(ws(0)); - LL(leader_idx)(bl).p_size = 0; + LL[leader_idx][bl].p_size = 0; //Get the columns pattern Int U_pattern_col = A_col; @@ -1606,7 +1606,7 @@ namespace BaskerNS //Copy into C - BASKER_MATRIX &Up = LU(U_pattern_col)(U_pattern_row); + 
BASKER_MATRIX &Up = LU[U_pattern_col][U_pattern_row]; for(Int i = Up.col_ptr(k); i < Up.col_ptr(k+1); i++) { const Int j = Up.row_idx(i); @@ -1620,7 +1620,7 @@ namespace BaskerNS //if there is a L if(L_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &Lp = LL(L_pattern_col)(L_pattern_row); + BASKER_MATRIX &Lp = LL[L_pattern_col][L_pattern_row]; for(Int i = Lp.col_ptr(k)+1; i < Lp.col_ptr(k+1);i++) { const Int j = Lp.row_idx(i); @@ -1653,8 +1653,8 @@ namespace BaskerNS ) { - const Int leader_idx = S(0)(kid); - BASKER_MATRIX &C = thread_array(kid).C; + const Int leader_idx = S[0][kid]; + BASKER_MATRIX &C = thread_array[kid].C; Int nnz = 0; Int gbrow = 0; @@ -1672,24 +1672,24 @@ namespace BaskerNS { //Copy B -> C Int bl = l+1; - Int A_col = S(lvl)(kid); + Int A_col = S[lvl][kid]; Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - S(bl)(kid) - S(0)(my_row_leader); + S[bl][kid] - S[0][my_row_leader]; //Int A_row = my_new_row; - Int A_row = (lvl==1)?(2):S(bl)(kid)%(LU_size(A_col)); - if((S(bl)(kid) > 14) && - (S(bl)(kid) > LU_size(A_col)) && + Int A_row = (lvl==1)?(2):S[bl][kid]%(LU_size(A_col)); + if((S[bl][kid] > 14) && + (S[bl][kid] > LU_size(A_col)) && (lvl != 1)) { //printf("test cm %d %d %d \n", // kid, S(bl)(kid), LU_size(A_col)); - Int tm = (S(bl)(kid)+1)/16; - A_row = ((S(bl)(kid)+1) - (tm*16))%LU_size(A_col); + Int tm = (S[bl][kid]+1)/16; + A_row = ((S[bl][kid]+1) - (tm*16))%LU_size(A_col); } @@ -1708,12 +1708,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM(A_col)(A_row)); + Bp = &(AVM[A_col][A_row]); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM(A_col)(0)); + Bp = &(ALM[A_col][0]); } BASKER_MATRIX &B = *Bp; //printf("ADDING UPDATES TO B\n"); @@ -1724,8 +1724,8 @@ namespace BaskerNS //return; //Int team_leader = find_leader(kid, l); //Not used - ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; - INT_1DARRAY ws = LL(leader_idx)(bl).iws; + ENTRY_1DARRAY 
X = LL[leader_idx][bl].ews; + INT_1DARRAY ws = LL[leader_idx][bl].iws; //const Int brow = LL(leader_idx)(bl).srow; //const Int nrow = LL(leader_idx)(bl).nrow; //Int p_size = LL(leader_idx)(bl).p_size; @@ -1789,11 +1789,11 @@ namespace BaskerNS //Int CM_idx = kid; - ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; - INT_1DARRAY ws = LL(leader_idx)(bl).iws; - const Int ws_size = LL(leader_idx)(bl).ews_size; + ENTRY_1DARRAY X = LL[leader_idx][bl].ews; + INT_1DARRAY ws = LL[leader_idx][bl].iws; + const Int ws_size = LL[leader_idx][bl].ews_size; // const Int brow = LL(leader_idx)(bl).srow; //NU //NDE - warning: unused - const Int nrow = LL(leader_idx)(bl).nrow; + const Int nrow = LL[leader_idx][bl].nrow; //Int p_size = LL(leader_idx)(bl).p_size; //For recounting patterns in dense blk @@ -1883,12 +1883,12 @@ namespace BaskerNS ) { //Get needed variables - const Int L_col = S(lvl)(kid); + const Int L_col = S[lvl][kid]; const Int L_row = 0; - const Int U_col = S(lvl)(kid); + const Int U_col = S[lvl][kid]; const Int U_row = LU_size(U_col)-1; - const Int X_col = S(0)(kid); + const Int X_col = S[0][kid]; //Int col_idx_offset = 0; //can we get rid of now? 
@@ -1902,10 +1902,10 @@ namespace BaskerNS #endif //end get needed variables - BASKER_MATRIX &L = LL(L_col)(L_row); - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &L = LL[L_col][L_row]; + BASKER_MATRIX &U = LU[U_col][U_row]; - BASKER_MATRIX &B = thread_array(kid).C; + BASKER_MATRIX &B = thread_array[kid].C; #ifdef BASKER_DEBUG_NFACTOR_COL if(kid >= 0) @@ -1926,9 +1926,9 @@ namespace BaskerNS } */ - INT_1DARRAY ws = LL(X_col)(l+1).iws; - const Int ws_size = LL(X_col)(l+1).iws_size; - ENTRY_1DARRAY X = LL(X_col)(l+1).ews; + INT_1DARRAY ws = LL[X_col][l+1].iws; + const Int ws_size = LL[X_col][l+1].iws_size; + ENTRY_1DARRAY X = LL[X_col][l+1].ews; const Int brow = U.srow; //const Int bcol = U.scol; @@ -2201,17 +2201,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = L_col; - thread_array(kid).error_subblk = -1; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = L_col; + thread_array[kid].error_subblk = -1; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -2229,16 +2229,16 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; } else { - thread_array(kid).error_type = + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_blk = U_col; - thread_array(kid).error_subblk = U_row; - thread_array(kid).error_info = newsize; + thread_array[kid].error_blk = U_col; + thread_array[kid].error_subblk = U_row; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -2462,20 +2462,20 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S(lvl)(leader_id); + const Int L_col = 
S[lvl][leader_id]; Int L_row = 0; - const Int U_col = S(lvl)(leader_id); + const Int U_col = S[lvl][leader_id]; Int U_row = LU_size(U_col)-1; - Int X_col = S(0)(leader_id); + Int X_col = S[0][leader_id]; Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; + INT_1DARRAY ws = LL[X_col][X_row].iws; //const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; //const Int brow = U.srow; //const Int bcol = U.scol; @@ -2585,18 +2585,18 @@ namespace BaskerNS //const Int lteam_size = pow(2,l+1); //NDE - warning: unused // const Int L_col = S(lvl)(leader_id); //NDE - warning: unused // Int L_row = 0; //NDE - warning: unused - const Int U_col = S(lvl)(leader_id); + const Int U_col = S[lvl][leader_id]; Int U_row = LU_size(U_col)-1; - Int X_col = S(0)(leader_id); + Int X_col = S[0][leader_id]; Int X_row = l+1; //Int col_idx_offset = 0; //can get rid of?//NDE - warning: unused //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused - BASKER_MATRIX &U = LU(U_col)(U_row); + BASKER_MATRIX &U = LU[U_col][U_row]; - INT_1DARRAY ws = LL(X_col)(X_row).iws; + INT_1DARRAY ws = LL[X_col][X_row].iws; //const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + ENTRY_1DARRAY X = LL[X_col][X_row].ews; if(kid == leader_id) { @@ -2621,11 +2621,11 @@ namespace BaskerNS const BASKER_BOOL lower ) { - const Int my_idx = S(0)(kid); + const Int my_idx = S[0][kid]; //should remove either as a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S(0)(team_leader); + const Int leader_idx = S[0][team_leader]; //If I an not a leader, then need to copy over if(kid != team_leader) @@ -2636,15 +2636,15 @@ namespace BaskerNS { //const Int blk = l+1; - ENTRY_1DARRAY &XL = 
LL(leader_idx)(blk).ews; + ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; // INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; //NDE - warning: unused // Int p_sizeL = LL(leader_idx)(blk).p_size; //NDE - warning: unused // Int ws_sizeL = LL(leader_idx)(blk).iws_size; //NDE - warning: unused - ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; - INT_1DARRAY &ws = LL(my_idx)(blk).iws; + ENTRY_1DARRAY &X = LL[my_idx][blk].ews; + INT_1DARRAY &ws = LL[my_idx][blk].iws; // const Int ws_size = LL(my_idx)(blk).iws_size; //NDE - warning: unused //Int p_size = LL(my_idx)(blk).p_size; - LL(my_idx)(blk).p_size = 0; + LL[my_idx][blk].p_size = 0; Int *color = &(ws[0]); // Int *pattern = &(color[ws_size]); //NDE - warning: unused // Int *stack = &(pattern[ws_size]); //NDE - warning: unused @@ -2682,7 +2682,7 @@ namespace BaskerNS - Int U_pattern_col = S(lvl)(kid); + Int U_pattern_col = S[lvl][kid]; Int U_pattern_row = BASKER_MAX_IDX; if(blk == l+1) @@ -2691,11 +2691,11 @@ namespace BaskerNS //S(0)(find_leader(kid,lvl)); //U_pattern_row = S(l+1)(kid) - //S(0)(my_pattern_leader); - U_pattern_row = S(l+1)(kid) - - S(0)(find_leader(kid,lvl-1)); + U_pattern_row = S[l+1][kid] - + S[0][find_leader(kid,lvl-1)]; } - Int L_pattern_col = S(lvl)(kid); + Int L_pattern_col = S[lvl][kid]; Int L_pattern_row = BASKER_MAX_IDX; if(lower == BASKER_TRUE) { @@ -2716,7 +2716,7 @@ namespace BaskerNS if(U_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &UP = LU(U_pattern_col)(U_pattern_row); + BASKER_MATRIX &UP = LU[U_pattern_col][U_pattern_row]; for(Int jj = UP.col_ptr(k); jj < UP.col_ptr(k+1); @@ -2730,7 +2730,7 @@ namespace BaskerNS }//if UPattern if(L_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &LP = LL(L_pattern_col)(L_pattern_row); + BASKER_MATRIX &LP = LL[L_pattern_col][L_pattern_row]; for(Int jj = LP.col_ptr(k); jj < LP.col_ptr(k+1); jj++) @@ -2769,11 +2769,11 @@ namespace BaskerNS //BASKER_MATRIX &B = AVM(A_col)(A_col); - const Int my_idx = S(0)(kid); + const Int my_idx = S[0][kid]; //should remove either as 
a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S(0)(team_leader); + const Int leader_idx = S[0][team_leader]; //Int loop_col_idx = S(l)(kid); NU //#ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -2807,13 +2807,13 @@ namespace BaskerNS { //const Int blk = l+1; - ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; + ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; // INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; //NDE - warning: unused - Int p_sizeL = LL(leader_idx)(blk).p_size; + Int p_sizeL = LL[leader_idx][blk].p_size; // Int ws_sizeL = LL(leader_idx)(blk).iws_size; //NDE - warning: unused - ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; - INT_1DARRAY &ws = LL(my_idx)(blk).iws; - const Int ws_size = LL(my_idx)(blk).iws_size; + ENTRY_1DARRAY &X = LL[my_idx][blk].ews; + INT_1DARRAY &ws = LL[my_idx][blk].iws; + const Int ws_size = LL[my_idx][blk].iws_size; //Int p_size = LL(my_idx)(blk).p_size; Int *color = &(ws[0]); Int *pattern = &(color[ws_size]); @@ -2845,7 +2845,7 @@ namespace BaskerNS #endif //over all nnnz found - for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) + for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) { color[jj] = 0; @@ -2910,7 +2910,7 @@ namespace BaskerNS //This can be removed in the future if(kid != team_leader) { - LL(my_idx)(blk).p_size = 0; + LL[my_idx][blk].p_size = 0; } else { @@ -2918,7 +2918,7 @@ namespace BaskerNS printf("SETTING PS: %d L:%d %d kid: %d\n", p_sizeL, leader_idx, blk, kid); #endif - LL(leader_idx)(blk).p_size = p_sizeL; + LL[leader_idx][blk].p_size = p_sizeL; //p_size = 0; NOT USED }//over all blks } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp index ccbd5a33b827..dc59708fe158 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp @@ -258,8 +258,8 @@ namespace BaskerNS Int btab = btf_tabs_offset; BASKER_MATRIX &M = (c >= 
btab ? BTF_C : BTF_D); - BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); - BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); + BASKER_MATRIX &U = (c >= btab ? UBTF[c-btab] : U_D[c]); + BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); Int k = btf_tabs(c); Int bcol = M.scol; @@ -294,9 +294,9 @@ namespace BaskerNS printf("Error: NaN diag in single factor\n"); } } - thread_array(kid).error_type = BASKER_ERROR_SINGULAR; - thread_array(kid).error_blk = c; - thread_array(kid).error_info = k; + thread_array[kid].error_type = BASKER_ERROR_SINGULAR; + thread_array[kid].error_blk = c; + thread_array[kid].error_info = k; return BASKER_ERROR; } @@ -336,8 +336,8 @@ namespace BaskerNS Int btab = btf_tabs_offset; BASKER_MATRIX &M = (c >= btab ? BTF_C : BTF_D); - BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); - BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); + BASKER_MATRIX &U = (c >= btab ? UBTF[c-btab] : U_D[c]); + BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); Int bcol = M.scol; //JDB: brow hack: fix. 
@@ -373,9 +373,9 @@ namespace BaskerNS Mag rmin_ (0.0); //workspace - Int ws_size = thread_array(kid).iws_size; - INT_1DARRAY ws = thread_array(kid).iws; - ENTRY_1DARRAY X = thread_array(kid).ews; + Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array[kid].iws; + ENTRY_1DARRAY X = thread_array[kid].ews; Int *color = &(ws(0)); Int *pattern = &(color[ws_size]); @@ -580,9 +580,9 @@ namespace BaskerNS << " Column: " << k << std::endl; } - thread_array(kid).error_type = BASKER_ERROR_NAN; - thread_array(kid).error_blk = c; - thread_array(kid).error_info = k; + thread_array[kid].error_type = BASKER_ERROR_NAN; + thread_array[kid].error_blk = c; + thread_array[kid].error_info = k; return BASKER_ERROR; } absv = abs(value); @@ -714,9 +714,9 @@ namespace BaskerNS pivot = normA_blk * eps; X(maxindex) = pivot; } else { - thread_array(kid).error_type = BASKER_ERROR_SINGULAR; - thread_array(kid).error_blk = c; - thread_array(kid).error_info = k; + thread_array[kid].error_type = BASKER_ERROR_SINGULAR; + thread_array[kid].error_blk = c; + thread_array[kid].error_info = k; return BASKER_ERROR; } } @@ -780,16 +780,16 @@ namespace BaskerNS (long)btf_tabs(c), (long)btf_tabs(c+1), (long)(btf_tabs(c+1)-btf_tabs(c))); } - thread_array(kid).error_blk = c; + thread_array[kid].error_blk = c; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -804,16 +804,16 @@ namespace BaskerNS printf("blk: %ld column: %ld \n", (long)c, (long)k); } - thread_array(kid).error_blk = c; + thread_array[kid].error_blk = c; if(Options.realloc == BASKER_FALSE) { - thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; + thread_array[kid].error_type = 
BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array(kid).error_info = newsize; + thread_array[kid].error_type = BASKER_ERROR_REMALLOC; + thread_array[kid].error_info = newsize; return BASKER_ERROR; } } @@ -991,8 +991,8 @@ namespace BaskerNS ) { //printf("=======LOCAL REACH BTF SHORT CALLED (pattern[top=%d - 1] = %d) =====\n",(int)top, (int)j); - INT_1DARRAY ws = thread_array(kid).iws; - Int ws_size = thread_array(kid).iws_size; + INT_1DARRAY ws = thread_array[kid].iws; + Int ws_size = thread_array[kid].iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1014,8 +1014,8 @@ namespace BaskerNS { //printf("=======LOCAL REACH BTF CALLED =====\n"); - INT_1DARRAY ws = thread_array(kid).iws; - Int ws_size = thread_array(kid).iws_size; + INT_1DARRAY ws = thread_array[kid].iws; + Int ws_size = thread_array[kid].iws_size; /*{ printf("ws_size: %d \n", ws_size); @@ -1144,8 +1144,8 @@ namespace BaskerNS ) { - INT_1DARRAY ws = thread_array(kid).iws; - Int ws_size = thread_array(kid).iws_size; + INT_1DARRAY ws = thread_array[kid].iws; + Int ws_size = thread_array[kid].iws_size; /* printf("ws_size: %d \n", ws_size); @@ -1289,9 +1289,9 @@ namespace BaskerNS { const Entry zero (0.0); - INT_1DARRAY ws = thread_array(kid).iws; - ENTRY_1DARRAY X = thread_array(kid).ews; - Int ws_size = thread_array(kid).iws_size; + INT_1DARRAY ws = thread_array[kid].iws; + ENTRY_1DARRAY X = thread_array[kid].ews; + Int ws_size = thread_array[kid].iws_size; Int brow = L.srow; Int *color = &(ws(0)); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp index 69d06a6bd72e..82ea04be3754 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp @@ -1096,11 +1096,19 @@ static int basker_sort_matrix_col(const void *arg1, const void *arg2) find_2D_convert(BTF_A); //now 
we can fill submatrices #ifdef BASKER_KOKKOS - kokkos_order_init_2D iO(this); - Kokkos::parallel_for(TeamPolicy(num_threads,1), iO); - Kokkos::fence(); + #ifdef BASKER_PARALLEL_INIT_2D + kokkos_order_init_2D iO(this); + Kokkos::parallel_for(TeamPolicy(num_threads,1), iO); + Kokkos::fence(); + #else + bool alloc = true; + //bool keep_zeros = true; + for (Int p = 0; p < num_threads; p++) { + this->t_init_2DA(p, alloc, keep_zeros); + } + #endif #else - //Comeback + //Comeback #endif #ifdef BASKER_TIMER double init_2d_time = scotch_timer.seconds(); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp index cc20d3b21e78..fd11208ea309 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp @@ -117,9 +117,11 @@ namespace BaskerNS // thread.team_rank()); Int kid = basker->t_get_kid(thread); #endif + printf( " * kokkos_sfactor_init_factor(%d) *\n",kid ); fflush(stdout); basker->t_init_factor(kid); + printf( " * kokkos_sfactor_init_factor(%d) done *\n",kid ); fflush(stdout); //This needs to be done earlier in ordering now //basker->t_init_2DA(kid); @@ -159,7 +161,7 @@ int Basker::sfactor() printf("Total NNZ: %ld \n", (long)global_nnz); printf(" > blk_matching = %d\n", (int)Options.blk_matching ); printf("----------------------------------\n"); - printf("\n"); + printf("\n"); fflush(stdout); } } @@ -169,28 +171,45 @@ int Basker::sfactor() } //Allocate Factorspace - //printf(" >> kokkos_sfactor_init_factor( btf_tabs_offset = %d, allocate_nd_workspace = %d ) <<\n", - // btf_tabs_offset,allocate_nd_workspace); + #ifdef BASKER_TIMER + printf(" >> kokkos_sfactor_init_factor( btf_tabs_offset = %d, allocate_nd_workspace = %d ) <<\n", + btf_tabs_offset,allocate_nd_workspace); fflush(stdout); + #endif if(btf_tabs_offset != 0 && allocate_nd_workspace) { #ifdef BASKER_KOKKOS + #ifdef BASKER_PARALLEL_INIT_FACTOR 
kokkos_sfactor_init_factor iF(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), iF); Kokkos::fence(); + #else + for (Int p = 0; p < num_threads; p++) { + this->t_init_factor(p); + } + #endif #else #endif } + #ifdef BASKER_TIMER + printf(" >> kokkos_sfactor_workspace <<\n"); fflush(stdout); + #endif //if(btf_tabs_offset != 0) { //Allocate workspace #ifdef BASKER_KOKKOS + #ifdef BASKER_PARALLEL_INIT_WORKSPACE typedef Kokkos::TeamPolicy TeamPolicy; kokkos_sfactor_init_workspace iWS(setup_flag, this); Kokkos::parallel_for(TeamPolicy(num_threads,1), iWS); Kokkos::fence(); + #else + for (Int p = 0; p < num_threads; p++) { + this->t_init_workspace(setup_flag, p); + } + #endif #endif } @@ -292,7 +311,9 @@ int Basker::sfactor() double time2 = 0.0; double time3 = 0.0; Kokkos::Timer timer1; + Kokkos::Timer timer2; timer.reset(); + timer2.reset(); #endif //split_num = num_threads/2; @@ -303,7 +324,7 @@ int Basker::sfactor() printf("\n --------------- OVER DOMS ---------------\n"); printf("\n"); } - #define SHYLU_BASKER_STREE_LIST + //#define SHYLU_BASKER_STREE_LIST std::vector stree_list (num_threads); #ifdef SHYLU_BASKER_STREE_LIST Kokkos::parallel_for( @@ -313,7 +334,7 @@ int Basker::sfactor() for(Int p = 0; p < num_threads; ++p) #endif { - Int blk = S(0)(p); + Int blk = S[0][p]; if(Options.verbose == BASKER_TRUE) { printf(" ============= DOMAIN BLK (p=%d) ============\n",(int)p); @@ -323,34 +344,34 @@ int Basker::sfactor() //printf("\n\n STREE SIZE: %d \n", AL[blk][0].ncol); //printf("Here 0\n"); //Find nnz_counts for leafs - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[p]; - e_tree (ALM(blk)(0), stree_p, 1); + e_tree (ALM[blk][0], stree_p, 1); #else - e_tree (ALM(blk)(0), stree, 1); + e_tree (ALM[blk][0], stree, 1); #endif - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1_2 += timer1.seconds(); timer1.reset(); 
#endif #ifdef SHYLU_BASKER_STREE_LIST - post_order(ALM(blk)(0), stree_p); + post_order(ALM[blk][0], stree_p); #else - post_order(ALM(blk)(0), stree); + post_order(ALM[blk][0], stree); #endif - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1_3 += timer1.seconds(); timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - col_count (ALM(blk)(0), stree_p); + col_count (ALM[blk][0], stree_p); #else - col_count (ALM(blk)(0), stree); + col_count (ALM[blk][0], stree); #endif - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1 += timer1.seconds(); #endif @@ -362,17 +383,17 @@ int Basker::sfactor() printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,0); printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,(int)LU_size(blk)-1); } - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - leaf_assign_nnz(LL(blk)(0), stree_p, 0); - leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree_p, 0); + leaf_assign_nnz(LL[blk][0], stree_p, 0); + leaf_assign_nnz(LU[blk][LU_size(blk)-1], stree_p, 0); #else - leaf_assign_nnz(LL(blk)(0), stree, 0); - leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree, 0); + leaf_assign_nnz(LL[blk][0], stree, 0); + leaf_assign_nnz(LU[blk][LU_size(blk)-1], stree, 0); #endif - #ifdef BASKER_TIMER + #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time2 += timer1.seconds(); #endif } @@ -380,22 +401,26 @@ int Basker::sfactor() ); Kokkos::fence(); #endif + #ifdef BASKER_TIMER + double dom_time = timer2.seconds(); + std::cout << " DOMAIN BLKs done : " << dom_time << std::endl; + #endif for(Int p = 0; p < num_threads; ++p) { //Do off diag - Int blk = S(0)(p); + Int blk = S[0][p]; #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[p]; #endif for(Int l =0; l < tree.nlvls; l++) { - Int U_col = S(l+1)(p); + Int U_col = S[l+1][p]; //Note: Need to think more about this flow //Should be subtracted by 
how many times in the //future - Int my_row_leader = S(0)(find_leader(p,l)); + Int my_row_leader = S[0][find_leader(p,l)]; //Int my_new_row = // blk - my_row_leader; Int U_row = blk-my_row_leader; @@ -416,10 +441,10 @@ int Basker::sfactor() timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - U_blk_sfactor(AVM(U_col)(U_row), stree_p, + U_blk_sfactor(AVM[U_col][U_row], stree_p, gScol[l], gSrow[glvl], off_diag); #else - U_blk_sfactor(AVM(U_col)(U_row), stree, + U_blk_sfactor(AVM[U_col][U_row], stree, gScol[l], gSrow[glvl], off_diag); #endif #ifdef BASKER_TIMER @@ -448,11 +473,11 @@ int Basker::sfactor() //printf( " U_assign_nnz(LU(%d,%d))\n",U_col,U_row ); double fill_factor = BASKER_DOM_NNZ_OVER+Options.user_fill; #ifdef SHYLU_BASKER_STREE_LIST - U_assign_nnz(LU(U_col)(U_row), stree_p, fill_factor, 0); - L_assign_nnz(LL(blk)(l+1), stree_p, fill_factor, 0); + U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); + L_assign_nnz(LL[blk][l+1], stree_p, fill_factor, 0); #else - U_assign_nnz(LU(U_col)(U_row), stree, fill_factor, 0); - L_assign_nnz(LL(blk)(l+1), stree, fill_factor, 0); + U_assign_nnz(LU[U_col][U_row], stree, fill_factor, 0); + L_assign_nnz(LL[blk][l+1], stree, fill_factor, 0); #endif #ifdef BASKER_TIMER time2 += timer1.seconds(); @@ -484,13 +509,17 @@ int Basker::sfactor() //over all the seps in a lvle #ifdef SHYLU_BASKER_STREE_LIST + //printf( " parallel for \n" ); Kokkos::parallel_for( "permute_col", p, KOKKOS_LAMBDA(const int pp) #else + //printf( " serial for \n" ); for(Int pp = 0; pp < p; pp++) #endif { - //printf( " -- level = %d separator = %d --\n",lvl,pp ); + #ifdef BASKER_TIMER + printf( " -- level = %d/%d separator = %d/%d --\n",lvl,tree.nlvls, pp,p ); fflush(stdout); + #endif //S blks Int ppp; ppp = pp*pow(tree.nparts, lvl+1); @@ -505,43 +534,50 @@ int Basker::sfactor() (long)U_col, (long)U_row, (long)lvl, (long)pp); #endif - Int U_col = S(lvl+1)(ppp); + Int U_col = S[lvl+1][ppp]; Int U_row = 0; //S_blk_sfactor(AL[U_col][U_row], stree, 
//gScol[lvl], gSrow[pp]); - //printf( " >>> S_blk_sfactor( ALM(%d)(%d) with %dx%d and nnz=%d) <<<\n",U_col,U_row, ALM(U_col)(U_row).nrow,ALM(U_col)(U_row).ncol,ALM(U_col)(U_row).nnz ); + #ifdef BASKER_TIMER + printf( " >>> S_blk_sfactor( ALM(%d)(%d) with %dx%d and nnz=%d) <<<\n",U_col,U_row, ALM[U_col][U_row].nrow,ALM[U_col][U_row].ncol,ALM[U_col][U_row].nnz ); fflush(stdout); + #endif #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[pp]; - S_blk_sfactor(ALM(U_col)(U_row), stree_p, - gScol(lvl), gSrow(pp)); + S_blk_sfactor(ALM[U_col][U_row], stree_p, + gScol[lvl], gSrow[pp]); #else - S_blk_sfactor(ALM(U_col)(U_row), stree, - gScol(lvl), gSrow(pp)); + S_blk_sfactor(ALM[U_col][U_row], stree, + gScol[lvl], gSrow[pp]); + #endif + #ifdef BASKER_TIMER + printf( " >>> -> nnz = %d\n",ALM[U_col][U_row].nnz ); fflush(stdout); #endif - //printf( " >>> -> nnz = %d\n",ALM(U_col)(U_row).nnz ); //S_assign_nnz(LL[U_col][U_row], stree, 0); if(Options.verbose == BASKER_TRUE) { - printf( " >> S_assign_nnz( LL(%d,%d) )\n",(int)U_col,(int)U_row ); + printf( " >> S_assign_nnz( LL(%d,%d) )\n",(int)U_col,(int)U_row ); fflush(stdout); } #ifdef SHYLU_BASKER_STREE_LIST - S_assign_nnz(LL(U_col)(U_row), stree_p, 0); + S_assign_nnz(LL[U_col][U_row], stree_p, 0); #else - S_assign_nnz(LL(U_col)(U_row), stree, 0); + S_assign_nnz(LL[U_col][U_row], stree, 0); #endif //S_assign_nnz(LU[U_col][LU_size[U_col]-1], stree,0); //printf( " >>> S_assign_nnz( LU(%d,%d) )\n",U_col,LU_size(U_col)-1 ); if(Options.verbose == BASKER_TRUE) { - printf( " ++ S_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)LU_size(U_col)-1); + printf( " ++ S_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)LU_size(U_col)-1); fflush(stdout); } #ifdef SHYLU_BASKER_STREE_LIST - S_assign_nnz(LU(U_col)(LU_size(U_col)-1), stree_p, 0); + S_assign_nnz(LU[U_col][LU_size(U_col)-1], stree_p, 0); #else - S_assign_nnz(LU(U_col)(LU_size(U_col)-1), stree, 0); + S_assign_nnz(LU[U_col][LU_size(U_col)-1], stree, 0); + #endif + #ifdef BASKER_TIMER + printf( 
" >>> -> nnz = %d\n",LU[U_col][LU_size(U_col)-1].nnz); fflush(stdout); #endif } #ifdef SHYLU_BASKER_STREE_LIST @@ -557,19 +593,20 @@ int Basker::sfactor() Int ppp; ppp = pp*pow(tree.nparts, lvl+1); - Int U_col = S(lvl+1)(ppp); + Int U_col = S[lvl+1][ppp]; Int U_row = 0; Int inner_blk = U_col; for(Int l = lvl+1; l < tree.nlvls; l++) { - U_col = S(l+1)(ppp); - U_row = S(lvl+1)(ppp)%LU_size(U_col); + //printf( " --- pp = %d/%d, l = %d/%d ---\n",pp,p, l,tree.nlvls ); fflush(stdout); + U_col = S[l+1][ppp]; + U_row = S[lvl+1][ppp]%LU_size(U_col); - Int my_row_leader = S(0)(find_leader(ppp,l)); + Int my_row_leader = S[0][find_leader(ppp,l)]; //Int my_new_row = // S(lvl+1)(ppp) - my_row_leader; - U_row = S(lvl+1)(ppp) - my_row_leader; + U_row = S[lvl+1][ppp] - my_row_leader; #ifdef BASKER_DEBUG_SFACTOR printf("offida sep, lvl: %d l: %d U_col: %d U_row: %d \n", lvl, l, U_col, U_row); @@ -578,11 +615,11 @@ int Basker::sfactor() Int off_diag = 1; #ifdef SHYLU_BASKER_STREE_LIST - U_blk_sfactor(AVM(U_col)(U_row), stree_p, - gScol(l), gSrow(pp), off_diag); + U_blk_sfactor(AVM[U_col][U_row], stree_p, + gScol[l], gSrow[pp], off_diag); #else - U_blk_sfactor(AVM(U_col)(U_row), stree, - gScol(l), gSrow(pp), off_diag); + U_blk_sfactor(AVM[U_col][U_row], stree, + gScol[l], gSrow[pp], off_diag); #endif //In symmetric will not need @@ -598,14 +635,15 @@ int Basker::sfactor() { printf( " ++ leaf_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)U_row); printf( " ++ leaf_assign_nnz(LL(%d, %d))\n",(int)inner_blk,(int)(l-lvl)); + fflush(stdout); } double fill_factor = BASKER_SEP_NNZ_OVER+Options.user_fill; #ifdef SHYLU_BASKER_STREE_LIST - U_assign_nnz(LU(U_col)(U_row), stree_p, fill_factor, 0); - L_assign_nnz(LL(inner_blk)(l-lvl), stree_p, fill_factor, 0); + U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); + L_assign_nnz(LL[inner_blk][l-lvl], stree_p, fill_factor, 0); #else - U_assign_nnz(LU(U_col)(U_row), stree, fill_factor, 0); - L_assign_nnz(LL(inner_blk)(l-lvl), stree, fill_factor, 0); + 
U_assign_nnz(LU[U_col][U_row], stree, fill_factor, 0); + L_assign_nnz(LL[inner_blk][l-lvl], stree, fill_factor, 0); #endif //printf("Here 1 \n"); } @@ -625,6 +663,9 @@ int Basker::sfactor() FREE(gScol); FREE(gSrow); + #ifdef BASKER_TIMER + std::cout << " >> symmetric_sfactor done << " << std::endl; + #endif return 0; }//end symmetric_symbolic() @@ -1151,7 +1192,6 @@ int Basker::sfactor() BASKER_SYMBOLIC_TREE &ST ) { -printf( " col_count:: view \n" ); //Still like to find a way to do this without transpose BASKER_MATRIX Mt; matrix_transpose(MV, Mt); @@ -2419,6 +2459,9 @@ printf( " col_count:: view \n" ); //printf("number of blks: %d \n", // btf_nblks-btf_tabs_offset); #endif + #ifdef BASKER_TIMER + printf( " > btf_last_dense(%s) <\n",(flag ? "true" : "false") ); fflush(stdout); + #endif Int max_blk_size = 0; #if defined(BASKER_SPLIT_A) @@ -2440,7 +2483,7 @@ printf( " col_count:: view \n" ); nnz = lblk_size*lblk_size; } //printf( " LBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); - L_D(i).init_matrix("LBFT", + L_D[i].init_matrix("LBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2448,9 +2491,9 @@ printf( " col_count:: view \n" ); nnz); //For pruning - L_D(i).init_pend(); + L_D[i].init_pend(); - U_D(i).init_matrix("UBFT", + U_D[i].init_matrix("UBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2459,6 +2502,9 @@ printf( " col_count:: view \n" ); }//over all blks } #endif + #ifdef BASKER_TIMER + printf( " > top blocks done <\n" ); fflush(stdout); + #endif //Malloc L and U #ifdef BASKER_DEBUG_SFACTOR @@ -2487,7 +2533,7 @@ printf( " col_count:: view \n" ); nnz = lblk_size*lblk_size; } //printf( " LBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); - LBTF(i-btf_tabs_offset).init_matrix("LBFT", + LBTF[i-btf_tabs_offset].init_matrix("LBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2496,10 +2542,10 @@ printf( " col_count:: view \n" ); //For pruning //printf( " LBTF(%d).init_pend()\n",(int)(i-btf_tabs_offset) ); - LBTF(i-btf_tabs_offset).init_pend(); + 
LBTF[i-btf_tabs_offset].init_pend(); //printf( " UBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); - UBTF(i-btf_tabs_offset).init_matrix("UBFT", + UBTF[i-btf_tabs_offset].init_matrix("UBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2511,6 +2557,9 @@ printf( " col_count:: view \n" ); //MALLOC workspace }//over all blks } + #ifdef BASKER_TIMER + printf( " > left blocks done <\n" ); fflush(stdout); + #endif //JDB: This needs to be fixed max_blk_size = BTF_D.nrow + BTF_C.nrow; @@ -2525,14 +2574,14 @@ printf( " col_count:: view \n" ); for(Int i = 0 ; i < num_threads; i++) { - thread_array(i).iws_size = max_blk_size; - thread_array(i).ews_size = max_blk_size; + thread_array[i].iws_size = max_blk_size; + thread_array[i].ews_size = max_blk_size; //BASKER_ASSERT((thread_array(i).iws_size*thread_array(i).iws_mult) > 0, "Basker btf_last_dense assert: sfactor threads iws > 0 failed"); //BASKER_ASSERT((thread_array(i).ews_size*thread_array(i).ews_mult) > 0, "Basker btf_last_dense assert: sfactor threads ews > 0 failed"); if (max_blk_size > 0) { - MALLOC_INT_1DARRAY(thread_array(i).iws, thread_array(i).iws_size*thread_array(i).iws_mult); - MALLOC_ENTRY_1DARRAY(thread_array(i).ews, thread_array(i).ews_size*thread_array(i).ews_mult); + MALLOC_INT_1DARRAY(thread_array[i].iws, thread_array[i].iws_size*thread_array[i].iws_mult); + MALLOC_ENTRY_1DARRAY(thread_array[i].ews, thread_array[i].ews_size*thread_array[i].ews_mult); } #ifdef BASKER_DEBUG_SFACTOR printf("Malloc Thread: %d iws: %d \n", @@ -2545,8 +2594,12 @@ printf( " col_count:: view \n" ); } } + #ifdef BASKER_TIMER + printf( " > btf_last_dense done <\n" ); + #endif }//end btf_last_dense() }//end namespace Bakser +#undef BASKER_TIMER #endif//endif BASKER_SFACTOR_NEWFRM_HPP diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp index 64c041a6536c..ec7774a43f13 100644 --- 
a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp @@ -99,20 +99,20 @@ namespace BaskerNS for(Int p=0; p < num_threads; ++p) { - Int blk = S(0)(p); - sfactor_nd_dom_estimate(ALM(blk)(0), - LL(blk)(0), - LU(blk)(LU_size(blk)-1)); + Int blk = S[0][p]; + sfactor_nd_dom_estimate(ALM[blk][0], + LL[blk][0], + LU[blk][LU_size(blk)-1]); for(Int l=0; l < tree.nlvls; l++) { - Int U_col = S(l+1)(p); + Int U_col = S[l+1][p]; Int my_row_leader = find_leader(p,l); Int my_new_row = - blk - S(0)(my_row_leader); + blk - S[0][my_row_leader]; - Int U_row = (l==0)?(p%2):S(0)(p)%LU_size(U_col); + Int U_row = (l==0)?(p%2):S[0][p]%LU_size(U_col); if((blk > 14) && (blk > LU_size(U_col)) && (l!=0)) @@ -124,11 +124,11 @@ namespace BaskerNS //JDB TEST PASSED U_row = my_new_row; - sfactor_nd_upper_estimate(AVM(U_col)(U_row), - LU(U_col)(U_row)); + sfactor_nd_upper_estimate(AVM[U_col][U_row], + LU[U_col][U_row]); - sfactor_nd_lower_estimate(ALM(blk)(l+1), - LL(blk)(l+1)); + sfactor_nd_lower_estimate(ALM[blk][l+1], + LL[blk][l+1]); } // end for l @@ -138,41 +138,41 @@ namespace BaskerNS for(Int pp=0; pp < pow(tree.nparts, tree.nlvls-lvl-1); pp++) { Int ppp = pp*pow(tree.nparts, lvl+1); - Int U_col = S(lvl+1)(ppp); + Int U_col = S[lvl+1][ppp]; Int U_row = 0; - sfactor_nd_sep_estimate(ALM(U_col)(U_row), - LL(U_col)(U_row), - LU(U_col)(LU_size(U_col)-1)); + sfactor_nd_sep_estimate(ALM[U_col][U_row], + LL[U_col][U_row], + LU[U_col][LU_size(U_col)-1]); Int innerblk = U_col; for(Int l = lvl+1; l < tree.nlvls; l++) { - U_col = S(l+1)(ppp); + U_col = S[l+1][ppp]; Int my_row_leader = find_leader(ppp,l); Int my_new_row = - S(lvl+1)(ppp) - S(0)(my_row_leader); + S[lvl+1][ppp] - S[0][my_row_leader]; - U_row = S(lvl+1)(ppp)%LU_size(U_col); - if((S(lvl+1)(ppp) > 14) && - (S(lvl+1)(ppp) > LU_size(U_col)) + U_row = S[lvl+1][ppp]%LU_size(U_col); + if((S[lvl+1][ppp] > 14) && + (S[lvl+1][ppp] > LU_size(U_col)) ) { - Int tm = 
(S(lvl+1)(ppp)+1)/16; - U_row = ((S(lvl+1)(ppp)+1) - + Int tm = (S[lvl+1][ppp]+1)/16; + U_row = ((S[lvl+1][ppp]+1) - (tm*16))%LU_size(U_col); } //JDB TEST PASS U_row = my_new_row; - sfactor_nd_sep_upper_estimate(AVM(U_col)(U_row), - LU(U_col)(U_row)); + sfactor_nd_sep_upper_estimate(AVM[U_col][U_row], + LU[U_col][U_row]); sfactor_nd_sep_lower_estimate( - ALM(innerblk)(l-lvl), - LL(innerblk)(l-lvl)); + ALM[innerblk][l-lvl], + LL[innerblk][l-lvl]); }//for - l }//for -p diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp index b01d3ec72632..b2fa1204cd86 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp @@ -293,7 +293,7 @@ namespace BaskerNS for(Int b = nblks_c-1; b>= 0; b--) { //---Lower solve - BASKER_MATRIX &LC = LBTF(b); + BASKER_MATRIX &LC = LBTF[b]; #ifdef BASKER_DEBUG_SOLVE_RHS printf("\n\n btf b=%ld (%d x %d), LBTF(%d)\n", (long)b, (int)LC.nrow, (int)LC.ncol, (int)b); #endif @@ -303,7 +303,7 @@ namespace BaskerNS //printVec(y,gn); - BASKER_MATRIX &UC = UBTF(b); + BASKER_MATRIX &UC = UBTF[b]; //U(C)\x -> y upper_tri_solve(UC,x,y); @@ -420,7 +420,7 @@ namespace BaskerNS for(Int b = btf_top_tabs_offset-1; b>= 0; b--) { //L(C)\x -> y - BASKER_MATRIX &LC = L_D(b); + BASKER_MATRIX &LC = L_D[b]; lower_tri_solve(LC, x, y); #ifdef BASKER_DEBUG_SOLVE_RHS printf( "\n after L solve (b=%d)\n",b ); fflush(stdout); @@ -429,7 +429,7 @@ namespace BaskerNS #endif //U(C)\y -> x - BASKER_MATRIX &UC = U_D(b); + BASKER_MATRIX &UC = U_D[b]; upper_tri_solve(UC, y, x); #ifdef BASKER_DEBUG_SOLVE_RHS printf( "\n after U solve\n" ); fflush(stdout); @@ -476,7 +476,7 @@ namespace BaskerNS //Forward solve on A for(Int b = 0; b < tree.nblks; ++b) { - BASKER_MATRIX &L = LL(b)(0); + BASKER_MATRIX &L = LL[b][0]; //L\x -> y lower_tri_solve(L, x, y, scol_top); @@ -500,7 +500,7 @@ namespace BaskerNS //Update offdiag 
for(Int bb = 1; bb < LL_size(b); ++bb) { - BASKER_MATRIX &LD = LL(b)(bb); + BASKER_MATRIX &LD = LL[b][bb]; //x = LD*y; #ifdef BASKER_DEBUG_SOLVE_RHS char filename[200]; @@ -549,7 +549,7 @@ namespace BaskerNS #endif //U\y -> x - BASKER_MATRIX &U = LU(b)(LU_size(b)-1); + BASKER_MATRIX &U = LU[b][LU_size(b)-1]; upper_tri_solve(U, y, x, scol_top); // NDE: y , x positions swapped... // seems role of x and y changed... #ifdef BASKER_DEBUG_SOLVE_RHS @@ -568,7 +568,7 @@ namespace BaskerNS #endif //y = UB*x; - BASKER_MATRIX &UB = LU(b)(bb); + BASKER_MATRIX &UB = LU[b][bb]; neg_spmv(UB, x, y, scol_top); #ifdef BASKER_DEBUG_SOLVE_RHS diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp index f950e9bd6132..bfd6e2460062 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp @@ -346,10 +346,10 @@ namespace BaskerNS // Update off-diag in the block-row before the diag solve for(int bb = LL_size(b)-1; bb > 0; bb--) { - BASKER_MATRIX &LD = LL(b)(bb); + BASKER_MATRIX &LD = LL[b][bb]; neg_spmv_perm_tr(LD, x, y, scol_top); // update y as mod. rhs, x as solution } - BASKER_MATRIX &L = LL(b)(0); + BASKER_MATRIX &L = LL[b][0]; if (L.nrow != 0 && L.ncol != 0) // Avoid degenerate case e.g. empty block following nd-partitioning lower_tri_solve_tr(L, y, x, scol_top); // x and y should be equal after in M range... 
} @@ -373,10 +373,10 @@ namespace BaskerNS for(Int bb = 0; bb < LU_size(b)-1; bb++) { // update offdiag corresponding to the block-row - BASKER_MATRIX &UB = LU(b)(bb); + BASKER_MATRIX &UB = LU[b][bb]; neg_spmv_tr(UB, x, y, scol_top); } - BASKER_MATRIX &U = LU(b)(LU_size(b)-1); + BASKER_MATRIX &U = LU[b][LU_size(b)-1]; if (U.nrow != 0 && U.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(U, x, y, scol_top); } @@ -410,7 +410,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of { for(Int b = 0; b < btf_top_tabs_offset; b++) { - BASKER_MATRIX &UC = U_D(b); + BASKER_MATRIX &UC = U_D[b]; if ( b > 0 ) spmv_BTF_tr(b, BTF_D, x, y, false); @@ -418,7 +418,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (UC.nrow != 0 && UC.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(UC, x, y); - BASKER_MATRIX &LC = L_D(b); + BASKER_MATRIX &LC = L_D[b]; if (LC.nrow != 0 && LC.ncol != 0) // Avoid degenerate case lower_tri_solve_tr(LC, x, y); @@ -462,7 +462,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (nblks_c > 0) { Int offset = 0; for(Int b = 0; b < nblks_c; b++) { - BASKER_MATRIX &UC = UBTF(b); + BASKER_MATRIX &UC = UBTF[b]; // Update off-diag // Update X with Y @@ -472,7 +472,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (UC.nrow != 0 && UC.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(UC,x,y); - BASKER_MATRIX &LC = LBTF(b); + BASKER_MATRIX &LC = LBTF[b]; if (LC.nrow != 0 && LC.ncol != 0) // Avoid degenerate case lower_tri_solve_tr(LC,x,y); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp index 1248d7472b0e..bd5bc82efdbc 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp @@ -54,7 +54,7 @@ namespace BaskerNS #ifndef BASKER_KOKKOS 
FREE_INT_1DARRAY(iws); FREE_ENTRY_1DARRAY(ews); - C.Finalize(); + //C.Finalize(); #endif } @@ -129,13 +129,12 @@ namespace BaskerNS BASKER_INLINE ~basker_tree() { - //Finalize(); + Finalize(); }//end ~basker_tree BASKER_INLINE void Finalize() { - //printf("basker_tree Finalize todo \n"); if(nroots > 0) { FREE_INT_1DARRAY(roots); @@ -267,7 +266,7 @@ namespace BaskerNS ~basker_symbolic_tree() { - //Finalize(); + Finalize(); }//end ~basker_symbolic_tree BASKER_INLINE diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp index be4c146e9c83..81e3c78c7f9c 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp @@ -827,16 +827,16 @@ namespace BaskerNS Int U_view_size = (U_view_count(i) > 0 ? U_view_count(i) : 1); if (U_view_size > 0) { - MALLOC_MATRIX_1DARRAY(AVM(i), U_view_size); - MALLOC_MATRIX_1DARRAY(LU(i), U_view_size); + MALLOC_MATRIX_1DARRAY(AVM[i], U_view_size); + MALLOC_MATRIX_1DARRAY(LU[i], U_view_size); } //Malloc AL subarray // NOTE: size at least one to allow empty block Int L_view_size = (L_view_count(i) > 0 ? 
L_view_count(i): 1); if (L_view_size > 0) { - MALLOC_MATRIX_1DARRAY(ALM(i), L_view_size); - MALLOC_MATRIX_1DARRAY(LL(i), L_view_size); + MALLOC_MATRIX_1DARRAY(ALM[i], L_view_size); + MALLOC_MATRIX_1DARRAY(LL[i], L_view_size); } LU_size(i) = U_view_count(i); @@ -1056,7 +1056,7 @@ namespace BaskerNS (r_idx < tree.nblks && tree.row_tabs(r_idx+1) == tree.row_tabs(r_idx))) // skip empty blocks { if((L_row+1 < LL_size(L_col)) && - (tree.row_tabs(r_idx+1) == ALM(L_col)(L_row+1).srow)) + (tree.row_tabs(r_idx+1) == ALM[L_col][L_row+1].srow)) { //printf( " > ALM(%d)(%d).srow = %d, row_tab(%d) = %d\n",L_col,L_row+1,ALM(L_col)(L_row+1).srow, r_idx+1,tree.row_tabs(r_idx+1) ); L_row++; @@ -1071,7 +1071,7 @@ namespace BaskerNS (r_idx < tree.nblks && tree.row_tabs(r_idx+1) == tree.row_tabs(r_idx))) // skip empty blocks { if((U_row+1 < LU_size(U_col)) && - (tree.row_tabs(r_idx+1) == AVM(U_col)(U_row+1).srow)) + (tree.row_tabs(r_idx+1) == AVM[U_col][U_row+1].srow)) { //printf( " + AVM(%d)(%d).srow = %d, row_tab(%d) = %d\n",U_col,U_row+1,AVM(U_col)(U_row+1).srow, r_idx+1,tree.row_tabs(r_idx+1) ); U_row++; @@ -1095,8 +1095,8 @@ namespace BaskerNS //Get Matrix Ref - BASKER_MATRIX &Ltemp = ALM(L_col)(L_row); - BASKER_MATRIX &Utemp = AVM(U_col)(U_row); + BASKER_MATRIX &Ltemp = ALM[L_col][L_row]; + BASKER_MATRIX &Utemp = AVM[U_col][U_row]; Int bcol = Ltemp.scol; //diag blk @@ -1162,11 +1162,11 @@ namespace BaskerNS for(Int sb = 0; sb < LL_size(b); ++sb) { //printf( " ALM(%d)(%d).clean_col()\n",b,sb ); - ALM(b)(sb).clean_col(); + ALM[b][sb].clean_col(); } for(Int sb = 0; sb < LU_size(b); ++sb) { - AVM(b)(sb).clean_col(); + AVM[b][sb].clean_col(); } }//for - over all blks @@ -1178,6 +1178,7 @@ namespace BaskerNS BASKER_INLINE int Basker::sfactor_copy() { + printf( " .. 
sfactor_copy ..\n" ); fflush(stdout); //Reorder A; //Match order if(match_flag == BASKER_TRUE) @@ -1322,9 +1323,15 @@ namespace BaskerNS #ifdef BASKER_KOKKOS BASKER_BOOL keep_zeros = BASKER_FALSE; BASKER_BOOL alloc = alloc_BTFA; //BASKER_FALSE; - kokkos_order_init_2D iO(this, alloc, keep_zeros); // t_init_2DA; fill row_idx, vals into ALM, AVM calling convert2D - Kokkos::parallel_for(TeamPolicy(num_threads,1), iO); - Kokkos::fence(); + #ifdef BASKER_PARALLEL_INIT_2D + kokkos_order_init_2D iO(this, alloc, keep_zeros); // t_init_2DA; fill row_idx, vals into ALM, AVM calling convert2D + Kokkos::parallel_for(TeamPolicy(num_threads,1), iO); + Kokkos::fence(); + #else + for (Int p = 0; p < num_threads; p++) { + this->t_init_2DA(p, alloc, keep_zeros); + } + #endif #else //Comeback #endif diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp index 6009e346f73b..8ea5c54c8e89 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp @@ -144,17 +144,18 @@ enum BASKER_INCOMPLETE_CODE #define BASKER_KOKKOS_NOINIT Kokkos::ViewAllocateWithoutInitializing #define INT_RANK2DARRAY Kokkos::View #define INT_1DARRAY Kokkos::View -#define INT_2DARRAY Kokkos::View #define ENTRY_1DARRAY Kokkos::View -#define ENTRY_2DARRAY Kokkos::View #define BOOL_1DARRAY Kokkos::View #define BOOL_2DARRAY Kokkos::View -#define MATRIX_1DARRAY Kokkos::View -#define MATRIX_2DARRAY Kokkos::View -#define MATRIX_VIEW_1DARRAY Kokkos::View -#define MATRIX_VIEW_2DARRAY Kokkos::View -#define THREAD_1DARRAY Kokkos::View -#define THREAD_2DARRAY Kokkos::View + +#define INT_2DARRAY std::vector +#define ENTRY_2DARRAY std::vector +#define MATRIX_1DARRAY std::vector +#define MATRIX_2DARRAY std::vector +#define MATRIX_VIEW_1DARRAY std::vector +#define MATRIX_VIEW_2DARRAY std::vector +#define THREAD_1DARRAY std::vector +#define THREAD_2DARRAY std::vector #define 
INT_1DARRAY_PAIRS Kokkos::View*, BASKER_EXE_SPACE> //Macro Memory Calls @@ -163,7 +164,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC malloc_pairs_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = INT_1DARRAY_PAIRS(BASKER_KOKKOS_NOINIT("pairs_1d"),s); \ + /*a = INT_1DARRAY_PAIRS(BASKER_KOKKOS_NOINIT("pairs_1d"),s);*/ \ + Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -172,7 +174,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC int_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = INT_1DARRAY(BASKER_KOKKOS_NOINIT("int_1d"),s); \ + /*a = INT_1DARRAY(BASKER_KOKKOS_NOINIT("int_1d"),s);*/ \ + Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -181,7 +184,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s0>0, "BASKER ASSERT MALLOC int_rank2d: size to alloc > 0 fails"); \ BASKER_ASSERT(s1>0, "BASKER ASSERT MALLOC int_rank2d: size to alloc > 0 fails"); \ - a = INT_RANK2DARRAY(BASKER_KOKKOS_NOINIT("int_rank2d"),s0,s1); \ + /*a = INT_RANK2DARRAY(BASKER_KOKKOS_NOINIT("int_rank2d"),s0,s1);*/ \ + Kokkos::resize(a, s0,s1); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } @@ -189,7 +193,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0,"BASKER ASSERT MALLOC int_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = INT_2DARRAY("int_2d",s); \ + /*a = INT_2DARRAY("int_2d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -198,7 +203,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC entry_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = ENTRY_1DARRAY(BASKER_KOKKOS_NOINIT("entry_1d"),s); \ + /*a = ENTRY_1DARRAY(BASKER_KOKKOS_NOINIT("entry_1d"),s);*/ \ + Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -207,7 +213,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC entry_2d: size to alloc >= 0 fails"); \ if (s 
> 0) { \ - a = ENTRY_2DARRAY("entry_2d",s); \ + /*a = ENTRY_2DARRAY("entry_2d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -216,7 +223,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC bool_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = BOOL_1DARRAY(BASKER_KOKKOS_NOINIT("bool_1d"), s); \ + /*a = BOOL_1DARRAY(BASKER_KOKKOS_NOINIT("bool_1d"), s);*/ \ + Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -225,7 +233,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC bool_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = BOOL_2DARRAY("bool_2d", s); \ + /*a = BOOL_2DARRAY("bool_2d", s);*/ \ + Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -234,7 +243,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = MATRIX_1DARRAY("matrix_1d",s); \ + /*a = MATRIX_1DARRAY("matrix_1d",s)*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -243,7 +253,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = MATRIX_2DARRAY("matrix_2d",s); \ + /*a = MATRIX_2DARRAY("matrix_2d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -252,7 +263,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_view_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = MATRIX_VIEW_1DARRAY("matrix_view_1d",s); \ + /*a = MATRIX_VIEW_1DARRAY("matrix_view_1d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -261,7 +273,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_view_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = MATRIX_VIEW_2DARRAY("matrix_view_2d",s); \ + /*a = 
MATRIX_VIEW_2DARRAY("matrix_view_2d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -270,7 +283,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC thread_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = THREAD_1DARRAY("thread_1d",s); \ + /*a = THREAD_1DARRAY("thread_1d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -279,9 +293,10 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC thread_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - a = THREAD_2DARRAY("thread_2d",s); \ + /*a = THREAD_2DARRAY("thread_2d",s);*/ \ + a.resize(s); \ if(a.data() == NULL) \ - throw std::bad_alloc(); \ + throw std::bad_alloc(); \ } \ } //RESIZE (with copy) @@ -334,77 +349,92 @@ enum BASKER_INCOMPLETE_CODE #define FREE(a) BASKER_NO_OP -#define FREE_INT_1DARRAY_PAIRS(a) \ - { \ - a = INT_1DARRAY_PAIRS(); \ +#define FREE_INT_1DARRAY_PAIRS(a) \ + { \ + /*a = INT_1DARRAY_PAIRS();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_INT_1DARRAY(a) \ - { \ - a = INT_1DARRAY(); \ +#define FREE_INT_1DARRAY(a) \ + { \ + /*a = INT_1DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_INT_RANK2DARRAY(a) \ - { \ - a = INT_RANK2DARRAY(); \ +#define FREE_INT_RANK2DARRAY(a) \ + { \ + /*a = INT_RANK2DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_INT_2DARRAY(a,n) \ - { \ - a = INT_2DARRAY(); \ +#define FREE_INT_2DARRAY(a,n) \ + { \ + /*a = INT_2DARRAY();*/ \ + a.resize(0); \ } -#define FREE_ENTRY_1DARRAY(a) \ - { \ - a = ENTRY_1DARRAY(); \ +#define FREE_ENTRY_1DARRAY(a) \ + { \ + /*a = ENTRY_1DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_ENTRY_2DARRAY(a,n) \ - { \ - a = ENTRY_2DARRAY(); \ +#define FREE_ENTRY_2DARRAY(a,n) \ + { \ + /*a = ENTRY_2DARRAY();*/ \ + a.resize(0); \ } -#define FREE_BOOL_1DARRAY(a) \ - { \ - a = BOOL_1DARRAY(); \ +#define FREE_BOOL_1DARRAY(a) \ + { \ + /*a = BOOL_1DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_BOOL_2DARRAY(a,n) \ 
- { \ - a = BOOL_2DARRAY(); \ +#define FREE_BOOL_2DARRAY(a,n) \ + { \ + /*a = BOOL_2DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_MATRIX_1DARRAY(a) \ - { \ - a = MATRIX_1DARRAY(); \ +#define FREE_MATRIX_1DARRAY(a) \ + { \ + /*a = MATRIX_1DARRAY();*/ \ + a.resize(0); \ } -#define FREE_MATRIX_2DARRAY(a,n) \ - { \ - a = MATRIX_2DARRAY(); \ +#define FREE_MATRIX_2DARRAY(a,n) \ + { \ + /*a = MATRIX_2DARRAY();*/ \ + a.resize(0); \ } #define FREE_MATRIX_VIEW_1DARRAY(a) \ - { \ - a = MATRIX_VIEW_1DARRAY(); \ + { \ + /*a = MATRIX_VIEW_1DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#define FREE_MATRIX_VIEW_2DARRAY(a,n) \ - { \ - a = MATRIX_VIEW_2DARRAY(); \ +#define FREE_MATRIX_VIEW_2DARRAY(a,n) \ + { \ + /*a = MATRIX_VIEW_2DARRAY();*/ \ + a.resize(0); \ } #define FREE_THREAD_1DARRAY(a) \ - { \ - a = THREAD_1DARRAY(); \ + { \ + /*a = THREAD_1DARRAY();*/ \ + a.resize(0); \ } -#define FREE_THREAD_2DARRAY(a,n) \ - { \ - a = TRHEAD_2DARRAY(); \ +#define FREE_THREAD_2DARRAY(a,n) \ + { \ + /*a = TRHEAD_2DARRAY();*/ \ + Kokkos::resize(a,0); \ } -#else +#else // not BASKER_KOKKOS + //Execution Space #define BASKER_EXE_SPACE void* //ReMacro Basker Classes diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 130f62ea6127..9cf52f3db66d 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -252,11 +252,11 @@ namespace BaskerNS typedef Kokkos::TeamPolicy TeamPolicy; typedef typename TeamPolicy::member_type TeamMember; Kokkos::parallel_for( - TeamPolicy(Exe_Space::thread_pool_size(),1), - KOKKOS_LAMBDA(const TeamMember& thread) + TeamPolicy(Exe_Space::thread_pool_size(),1), + KOKKOS_LAMBDA(const TeamMember& thread) #else #pragma omp parallel - #endif + #endif { #ifdef BASKER_KOKKOS if(kid == thread.league_rank()) @@ -291,12 +291,11 @@ namespace BaskerNS #ifdef BASKER_KOKKOS typedef Kokkos::TeamPolicy TeamPolicy; typedef 
typename TeamPolicy::member_type TeamMember; - Kokkos::parallel_for( - TeamPolicy(Exe_Space::thread_pool_size(),1), - KOKKOS_LAMBDA(const TeamMember& thread) + Kokkos::parallel_for(TeamPolicy(Exe_Space::thread_pool_size(),1), + KOKKOS_LAMBDA(const TeamMember& thread) #else #pragma omp parallel - #endif + #endif { #ifdef BASKER_KOKKOS if(kid == thread.league_rank()) @@ -328,7 +327,7 @@ namespace BaskerNS { for(Int b=chunk_start; b < chunk_end; b++) { - BASKER_MATRIX &L = LBTF(b-btf_tabs_offset); + BASKER_MATRIX &L = LBTF[b-btf_tabs_offset]; L.clear_pend(); L.nnz = L.mnnz; }//end-for over chunck @@ -343,7 +342,7 @@ namespace BaskerNS #if defined(BASKER_SPLIT_A) for(Int b=chunk_start; b < chunk_end; b++) { - BASKER_MATRIX &L = L_D(b); + BASKER_MATRIX &L = L_D[b]; L.clear_pend(); L.nnz = L.mnnz; }//end-for over chunck @@ -359,7 +358,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; for(Int row = 0; row < LL_size(b); row++) { @@ -368,8 +367,8 @@ namespace BaskerNS b, row, kid, LL[b][row].nnz); #endif - LL(b)(row).clear_pend(); - LL(b)(row).nnz = LL(b)(row).mnnz; + LL[b][row].clear_pend(); + LL[b][row].nnz = LL[b][row].mnnz; }//end over all row }//end select which thread @@ -379,7 +378,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; #ifdef BASKER_DEBUG_INIT printf("U Factor init: %d %d, nnz: %ld \n", @@ -388,9 +387,9 @@ namespace BaskerNS #endif //LU(b)(LU_size(b)-1).nnz = 0; - for(Int kk = 0; kk < LU(b)(LU_size(b)-1).ncol+1; kk++) + for(Int kk = 0; kk < LU[b][LU_size(b)-1].ncol+1; kk++) { - LU(b)(LU_size(b)-1).col_ptr(kk) = 0; + LU[b][LU_size(b)-1].col_ptr(kk) = 0; } /* @@ -400,16 +399,16 @@ namespace BaskerNS LU(b)(LU_size(b)-1).mnnz); */ - LU(b)(LU_size(b)-1).nnz = LU(b)(LU_size(b)-1).mnnz; + LU[b][LU_size(b)-1].nnz = LU[b][LU_size(b)-1].mnnz; for(Int l = lvl+1; l < tree.nlvls+1; l++) { - Int U_col = S(l)(kid); + Int U_col = S[l][kid]; Int my_row_leader = 
find_leader(kid, l-1); Int my_new_row = - b - S(0)(my_row_leader); + b - S[0][my_row_leader]; - Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); + Int U_row = (l==1)?(kid%2):S[lvl][kid]%LU_size(U_col); //JDB TEST PASS U_row = my_new_row; @@ -420,9 +419,9 @@ namespace BaskerNS LU[U_col][U_row].nnz); #endif - for(Int kk = 0; kk < LU(U_col)(U_row).ncol+1; kk++) + for(Int kk = 0; kk < LU[U_col][U_row].ncol+1; kk++) { - LU(U_col)(U_row).col_ptr(kk) = 0; + LU[U_col][U_row].col_ptr(kk) = 0; } /* printf("flipU (%d,%d) %d %d \n", @@ -431,7 +430,7 @@ namespace BaskerNS LU(U_col)(U_row).mnnz); */ - LU(U_col)(U_row).nnz = LU(U_col)(U_row).mnnz; + LU[U_col][U_row].nnz = LU[U_col][U_row].mnnz; //LU(U_col)(U_row).nnz = 0; }//over inner lvls @@ -455,13 +454,13 @@ namespace BaskerNS Kokkos::Timer timer_init_matrixL; Kokkos::Timer timer_fill_matrixL; timer_initL.reset(); + printf( " > t_init_factor( tid = %d, nlvls = %d ) <\n",kid,tree.nlvls+1 ); fflush(stdout); #endif - //printf( " > t_init_factor( tid = %d ) <\n",kid ); for(Int lvl = 0; lvl < tree.nlvls+1; lvl++) { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; for(Int row = 0; row < LL_size(b); row++) { @@ -472,34 +471,39 @@ namespace BaskerNS #ifdef BASKER_TIMER timer_init_matrixL.reset(); + printf( " ++ lvl=%d: LL(%d,%d): nnz=%d, mnnz=%d ++\n",(int)lvl, (int)b, (int)row, (int)LL[b][row].nnz, (int)LL[b][row].mnnz); fflush(stdout); #endif - //printf( " lvl=%d: LL(%d,%d): nnz=%d, mnnz=%d\n",(int)lvl, (int)b, (int)row, (int)LL(b)(row).nnz, (int)LL(b)(row).mnnz); - LL(b)(row).init_matrix("Loffdig", - LL(b)(row).srow, - LL(b)(row).nrow, - LL(b)(row).scol, - LL(b)(row).ncol, - LL(b)(row).nnz); + LL[b][row].init_matrix("Loffdig", + LL[b][row].srow, + LL[b][row].nrow, + LL[b][row].scol, + LL[b][row].ncol, + LL[b][row].nnz); #ifdef BASKER_TIMER + printf( " >> LL(%d,%d).init_matrix done <<\n",b,row ); fflush(stdout); init_matrixL_time += timer_init_matrixL.seconds(); #endif //Fix when this all happens 
in the future if(Options.incomplete == BASKER_TRUE) { - LL(b)(row).init_inc_lvl(); + LL[b][row].init_inc_lvl(); } #ifdef BASKER_TIMER timer_fill_matrixL.reset(); + printf( " ++ zero out (%d) ++\n",int(LL[b][row].col_ptr.extent(0)) ); fflush(stdout); #endif //LL(b)(row).fill(); - Kokkos::deep_copy(LL(b)(row).col_ptr, 0); + LL[b][row].init_ptr(); + //Kokkos::deep_copy(LL(b)(row).col_ptr, 0); #ifdef BASKER_TIMER + printf( " LL(%d)(%d).init_pend(ncol = %d)\n",b,row,LL[b][row].ncol ); fflush(stdout); fill_matrixL_time += timer_fill_matrixL.seconds(); #endif - //printf( " LL(%d)(%d).init_pend(ncol = %d)\n",b,row,LL(b)(row).ncol ); - LL(b)(row).init_pend(); - + LL[b][row].init_pend(); + #ifdef BASKER_TIMER + printf( " (b=%d: row=%d) done\n\n",b,row ); fflush(stdout); + #endif }//end over all row }//end select which thread }//end for over all lvl @@ -508,6 +512,7 @@ namespace BaskerNS std::cout << " > Basker t_init_factor::initL(" << kid << "): time: " << initL_time << std::endl; std::cout << " > + Basker t_init_factor::initL::initMatrix(" << kid << "): time: " << init_matrixL_time << std::endl; std::cout << " > + Basker t_init_factor::initL::fillMatrix(" << kid << "): time: " << fill_matrixL_time << std::endl; + fflush(stdout); #endif //U @@ -519,7 +524,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; #ifdef BASKER_DEBUG_INIT printf("U Factor init: %d %d, nnz: %ld \n", @@ -528,25 +533,26 @@ namespace BaskerNS #endif //printf( " lvl=%d: LU(%d,%d): nnz=%d, mnnz=%d\n", (int)lvl, (int)b, (int)LU_size(b)-1, (int)LU(b)(LU_size(b)-1).nnz, (int)LU(b)(LU_size(b)-1).mnnz); - LU(b)(LU_size(b)-1).init_matrix("Udiag", - LU(b)(LU_size(b)-1).srow, - LU(b)(LU_size(b)-1).nrow, - LU(b)(LU_size(b)-1).scol, - LU(b)(LU_size(b)-1).ncol, - LU(b)(LU_size(b)-1).nnz); + LU[b][LU_size(b)-1].init_matrix("Udiag", + LU[b][LU_size(b)-1].srow, + LU[b][LU_size(b)-1].nrow, + LU[b][LU_size(b)-1].scol, + LU[b][LU_size(b)-1].ncol, + 
LU[b][LU_size(b)-1].nnz); //LU(b)(LU_size(b)-1).fill(); - Kokkos::deep_copy(LU(b)(LU_size(b)-1).col_ptr, 0); + LU[b][LU_size(b)-1].init_ptr(); + //Kokkos::deep_copy(LU(b)(LU_size(b)-1).col_ptr, 0); for(Int l = lvl+1; l < tree.nlvls+1; l++) { - Int U_col = S(l)(kid); + Int U_col = S[l][kid]; Int my_row_leader = find_leader(kid, l-1); Int my_new_row = - b - S(0)(my_row_leader); + b - S[0][my_row_leader]; - Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); + Int U_row = (l==1)?(kid%2):S[lvl][kid]%LU_size(U_col); if( (b > 14) && // NDE: Why is 14 specifically used here? (b > LU_size(U_col)) && @@ -577,19 +583,20 @@ namespace BaskerNS #endif //printf( " > l=%d: LU(%d,%d): nnz=%d, mnnz=%d\n", (int)l, (int)U_col, (int)U_row, (int)LU(U_col)(U_row).nnz, (int)LU(U_col)(U_row).mnnz); - LU(U_col)(U_row).init_matrix("Uoffdiag", - LU(U_col)(U_row).srow, - LU(U_col)(U_row).nrow, - LU(U_col)(U_row).scol, - LU(U_col)(U_row).ncol, - LU(U_col)(U_row).nnz); + LU[U_col][U_row].init_matrix("Uoffdiag", + LU[U_col][U_row].srow, + LU[U_col][U_row].nrow, + LU[U_col][U_row].scol, + LU[U_col][U_row].ncol, + LU[U_col][U_row].nnz); //LU(U_col)(U_row).fill(); - Kokkos::deep_copy(LU(U_col)(U_row).col_ptr, 0); + LU[U_col][U_row].init_ptr(); + //Kokkos::deep_copy(LU(U_col)(U_row).col_ptr, 0); if(Options.incomplete == BASKER_TRUE) { - LU(U_col)(U_row).init_inc_lvl(); + LU[U_col][U_row].init_inc_lvl(); } }//over inner lvls @@ -624,15 +631,15 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; for(Int row = 0; row < LL_size(b); row++) { #ifdef BASKER_DEBUG_INIT printf("ALM Factor Init: %d %d , kid: %d, nnz: %d nrow: %d ncol: %d \n", - b, row, kid, ALM(b)(row).nnz, - ALM(b)(row).nrow, - ALM(b)(row).ncol); + b, row, kid, ALM[b][row].nnz, + ALM[b][row].nrow, + ALM[b][row].ncol); #endif /*if (kid == 1) @@ -647,7 +654,7 @@ namespace BaskerNS printf("ALM(%d,%d: %dx%d) alloc with A: kid=%d btf=%d\n", b, row, ALM(b)(row).nrow, ALM(b)(row).ncol, kid, 
Options.btf); #endif - ALM(b)(row).convert2D(A, alloc, kid); + ALM[b][row].convert2D(A, alloc, kid); } else { @@ -656,7 +663,7 @@ namespace BaskerNS printf("ALM(%d,%d, %dx%d) alloc (btf) with BTF_A: kid=%d \n", b, row, ALM(b)(row).nrow, ALM(b)(row).ncol, kid); #endif - ALM(b)(row).convert2D(BTF_A, alloc, kid); + ALM[b][row].convert2D(BTF_A, alloc, kid); } /*if (kid == 0) { for(Int j = 0; j < ALM(b)(row).ncol; j++) { @@ -676,14 +683,14 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; #ifdef BASKER_DEBUG_INTI printf("AUM Factor init: %d %d, kid: %d nnz: %d nrow: %d ncol: %d \n", b, LU_size(b)-1, kid, - AVM(b)(LU_size(b)-1).nnz, - AVM(b)(LU_size(b)-1).nrow, - AVM(b)(LU_size(b)-1).ncol); + AVM[b][LU_size(b)-1].nnz, + AVM[b][LU_size(b)-1].nrow, + AVM[b][LU_size(b)-1].ncol); #endif /*if (kid == 1) { @@ -692,13 +699,13 @@ namespace BaskerNS }*/ if(Options.btf == BASKER_FALSE) { - AVM(b)(LU_size(b)-1).convert2D(A, alloc, kid); + AVM[b][LU_size(b)-1].convert2D(A, alloc, kid); } else { //printf("Using BTF AU\n"); //printf(" > kid=%d: convert2D AVM(%d,%d)\n", kid, b, LU_size(b)-1); - AVM(b)(LU_size(b)-1).convert2D(BTF_A, alloc, kid); + AVM[b][LU_size(b)-1].convert2D(BTF_A, alloc, kid); } /*if (kid == 0) { for(Int j = 0; j < AVM(b)(LU_size(b)-1).ncol; j++) { @@ -715,10 +722,10 @@ namespace BaskerNS //TEST Int my_leader = find_leader(kid,l-1); - Int my_leader_row = S(0)(my_leader); + Int my_leader_row = S[0][my_leader]; //Int my_col_size = pow(2,l); Not used Int my_new_row = - (S(lvl)(kid) - my_leader_row); + (S[lvl][kid] - my_leader_row); //my_new_row = my_new_row%my_col_size; /* @@ -729,7 +736,7 @@ namespace BaskerNS my_col_size, my_new_row); */ - Int U_col = S(l)(kid); + Int U_col = S[l][kid]; Int U_row = my_new_row; //Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); @@ -755,9 +762,9 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("Init AUM: %d %d lvl: %d l: %d kid: %d nnz: %d nrow: %d ncol: %d \n", U_col, 
U_row, lvl, l, kid, - AVM(U_col)(U_row).nnz, - AVM(U_col)(U_row).nrow, - AVM(U_col)(U_row).ncol); + AVM[U_col][U_row].nnz, + AVM[U_col][U_row].nrow, + AVM[U_col][U_row].ncol); #endif #if 0 @@ -775,9 +782,9 @@ namespace BaskerNS { //printf(" %d: Using BTF AVM(%d,%d), %dx%d\n",kid,U_col,U_row, AVM(U_col)(U_row).nrow,AVM(U_col)(U_row).ncol); //printf("2nd convert AVM: %d %d size:%d kid: %d\n", - // U_col, U_row, AVM(U_col)(U_row).nnz, + // U_col, U_row, AVM(U_col)(U_row).nnz, // kid); - AVM(U_col)(U_row).convert2D(BTF_A, alloc, kid); + AVM[U_col][U_row].convert2D(BTF_A, alloc, kid); //printf(" %d: Using BTF AU(%d,%d) done\n",kid,U_col,U_row); } @@ -807,22 +814,22 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S(lvl)(kid); + Int b = S[lvl][kid]; for(Int l = 0; l < LL_size(b); l++) { //defining here - LL(b)(l).iws_size = LL(b)(l).nrow; + LL[b][l].iws_size = LL[b][l].nrow; //This can be made smaller, see notes in Sfactor_old - LL(b)(l).iws_mult = 5; - LL(b)(l).ews_size = LL(b)(l).nrow; + LL[b][l].iws_mult = 5; + LL[b][l].ews_size = LL[b][l].nrow; //This can be made smaller, see notes in sfactor_old - LL(b)(l).ews_mult = 2; + LL[b][l].ews_mult = 2; - Int iws_size = LL(b)(l).iws_size; - Int iws_mult = LL(b)(l).iws_mult; - Int ews_size = LL(b)(l).ews_size; - Int ews_mult = LL(b)(l).ews_mult; + Int iws_size = LL[b][l].iws_size; + Int iws_mult = LL[b][l].iws_mult; + Int ews_size = LL[b][l].ews_size; + Int ews_mult = LL[b][l].ews_mult; if(iws_size > max_sep_size) { @@ -835,10 +842,10 @@ namespace BaskerNS } BASKER_ASSERT((iws_size*iws_mult)>0, "util iws"); - MALLOC_INT_1DARRAY(LL(b)(l).iws, iws_size*iws_mult); + MALLOC_INT_1DARRAY(LL[b][l].iws, iws_size*iws_mult); for(Int i=0; i 0) { BASKER_ASSERT((ews_size*ews_mult)>0, "util ews"); - MALLOC_ENTRY_1DARRAY(LL(b)(l).ews, ews_size*ews_mult); + MALLOC_ENTRY_1DARRAY(LL[b][l].ews, ews_size*ews_mult); for(Int i=0; i::find_leader(Int kid, Int l) { l = l+1; - Int my_token = S(l)(kid); + Int my_token = S[l][kid]; Int 
my_loc = kid; while((my_loc > 0)) { my_loc--; - if(S(l)(my_loc) != my_token) + if(S[l][my_loc] != my_token) { my_loc++; break; @@ -2477,4 +2485,5 @@ namespace BaskerNS }//end namespace basker +#undef BASKER_TIMER #endif //end basker_util_hpp From e484f62bbdade30959fa3541503bd993ea7faf64 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Thu, 24 Oct 2024 16:12:13 -0600 Subject: [PATCH 067/243] ShyLU - Basker : memory tune Signed-off-by: iyamazaki --- .../basker/src/shylubasker_decl.hpp | 7 -- .../basker/src/shylubasker_nfactor.hpp | 17 +--- .../basker/src/shylubasker_sfactor.hpp | 87 +++++++++++-------- .../basker/src/shylubasker_structs.hpp | 1 - .../basker/src/shylubasker_types.hpp | 33 +------ .../basker/src/shylubasker_util.hpp | 13 ++- 6 files changed, 69 insertions(+), 89 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp index 94f4ba1df086..f9b33e325bd7 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp @@ -1494,13 +1494,6 @@ namespace BaskerNS //end NDE - - //RHS and solutions (These are not used anymore) - ENTRY_2DARRAY rhs; - ENTRY_2DARRAY sol; - Int nrhs; - - BASKER_TREE part_tree; BASKER_TREE tree; BASKER_SYMBOLIC_TREE stree; diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor.hpp index d2c6a5690528..cef593230d5e 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor.hpp @@ -171,13 +171,9 @@ namespace BaskerNS }//end while if(Options.verbose == BASKER_TRUE) { - printf("Time DOMAIN: %lf \n", timer.seconds()); + printf("Time DOMAIN: %lf \n\n", timer.seconds()); timer.reset(); } - #ifdef BASKER_TIMER - printf("Time DOMAIN: %lf \n", timer.seconds()); - timer.reset(); - #endif #else// else basker_kokkos #pragma omp parallel @@ 
-282,13 +278,9 @@ namespace BaskerNS //printf( " End Sep: info = %d (%d, %d)\n",info,BASKER_SUCCESS,BASKER_ERROR ); if(Options.verbose == BASKER_TRUE) { - printf("Time SEP: %lf \n", timer.seconds()); + printf("Time SEP: %lf \n\n", timer.seconds()); timer.reset(); } - #ifdef BASKER_TIMER - printf("Time SEP: %lf \n", timer.seconds()); - timer.reset(); - #endif } // ---------------------------------------------------------------------------------------- // @@ -363,11 +355,8 @@ namespace BaskerNS if(Options.verbose == BASKER_TRUE) { - printf("Time BTF: %lf \n", timer.seconds()); + printf("Time BTF: %lf \n\n", timer.seconds()); } - #ifdef BASKER_TIMER - printf("Time BTF: %lf \n", timer.seconds()); - #endif }//end btf call Kokkos::Timer tzback; diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp index fd11208ea309..d48f0e720a7e 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp @@ -381,7 +381,7 @@ int Basker::sfactor() if(Options.verbose == BASKER_TRUE) { printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,0); - printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,(int)LU_size(blk)-1); + printf( " >> leaf_assign_nnz(LU(%d)(%d))\n",(int)blk,(int)LU_size(blk)-1); } #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) timer1.reset(); @@ -403,7 +403,7 @@ int Basker::sfactor() #endif #ifdef BASKER_TIMER double dom_time = timer2.seconds(); - std::cout << " DOMAIN BLKs done : " << dom_time << std::endl; + std::cout << " DOMAIN BLKs done : " << dom_time << std::endl << std::endl; #endif for(Int p = 0; p < num_threads; ++p) @@ -462,16 +462,15 @@ int Basker::sfactor() //Assign nnz counts for leaf off-diag //U_assign_nnz(LU[U_col][U_row], stree, 0); //L_assign_nnz(LL[blk][l+1], stree, 0); - if(Options.verbose == BASKER_TRUE) - { - printf( " ++ leaf_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)U_row); - 
printf( " ++ leaf_assign_nnz(LL(%d, %d))\n",(int)blk,(int)l+1); - } #ifdef BASKER_TIMER timer1.reset(); #endif - //printf( " U_assign_nnz(LU(%d,%d))\n",U_col,U_row ); double fill_factor = BASKER_DOM_NNZ_OVER+Options.user_fill; + if(Options.verbose == BASKER_TRUE) + { + printf( " ++ U_assign_nnz(LU(%d, %d)) fill-factor x(%f+%f = %f)\n",(int)U_col,(int)U_row, BASKER_DOM_NNZ_OVER,Options.user_fill,fill_factor); + printf( " ++ L_assign_nnz(LL(%d, %d)) fill-factor x(%f+%f = %f)\n",(int)blk,(int)l+1, BASKER_DOM_NNZ_OVER,Options.user_fill,fill_factor); + } #ifdef SHYLU_BASKER_STREE_LIST U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); L_assign_nnz(LL[blk][l+1], stree_p, fill_factor, 0); @@ -490,7 +489,7 @@ int Basker::sfactor() std::cout << " >> symmetric_sfactor::domain : " << timer.seconds() << " seconds" << std::endl; std::cout << " ++ symmetric_sfactor::domain::postorder : " << time1_2 << " + " << time1_3 << " + " << time1 << " seconds" << std::endl; std::cout << " ++ symmetric_sfactor::domain::init : " << time2 << " seconds" << std::endl; - std::cout << " ++ symmetric_sfactor::domain::sfactor : " << time3 << " seconds" << std::endl; + std::cout << " ++ symmetric_sfactor::domain::sfactor : " << time3 << " seconds" << std::endl << std::endl; timer.reset(); #endif @@ -631,13 +630,13 @@ int Basker::sfactor() //Assign nnz + double fill_factor = BASKER_SEP_NNZ_OVER+Options.user_fill; if(Options.verbose == BASKER_TRUE) { - printf( " ++ leaf_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)U_row); - printf( " ++ leaf_assign_nnz(LL(%d, %d))\n",(int)inner_blk,(int)(l-lvl)); + printf( " ++ leaf_assign_nnz(LU(%d, %d)) fill-factor x(%d+%f = %f)\n",(int)U_col,(int)U_row, (int)BASKER_SEP_NNZ_OVER,Options.user_fill,fill_factor); + printf( " ++ leaf_assign_nnz(LL(%d, %d)) fill-factor x(%d+%f = %f)\n",(int)inner_blk,(int)(l-lvl), (int)BASKER_SEP_NNZ_OVER,Options.user_fill,fill_factor); fflush(stdout); } - double fill_factor = BASKER_SEP_NNZ_OVER+Options.user_fill; #ifdef 
SHYLU_BASKER_STREE_LIST U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); L_assign_nnz(LL[inner_blk][l-lvl], stree_p, fill_factor, 0); @@ -2260,6 +2259,9 @@ int Basker::sfactor() Int option ) { + #ifdef BASKER_TIMER + printf("leaf_assign_nnz:\n"); + #endif if(option == 0) { const Int Int_MAX = std::numeric_limits::max(); @@ -2268,19 +2270,23 @@ int Basker::sfactor() for(Int i = 0; i < M.ncol; i++) { if (t_nnz <= Int_MAX - ST.col_counts[i]) { + #ifdef BASKER_TIMER + //printf( " > %d: %d += %d\n",i,t_nnz, ST.col_counts[i] ); + #endif t_nnz += ST.col_counts[i]; } else { // let's just hope it is enough, if overflow break; } } - #ifdef BASKER_DEBUG_SFACTOR - printf("leaf nnz: %ld \n", (long)t_nnz); + #ifdef BASKER_TIMER + printf(" > leaf nnz: (%ld + %ld) / 2 = %ld\n", (long)t_nnz,(long)M.ncol,(long)(t_nnz+M.ncol)/2); #endif + t_nnz = long(t_nnz+M.ncol)/2; //double nnz_shoulder = 1.05; double fill_factor = BASKER_DOM_NNZ_OVER+Options.user_fill; // used to boost fill estimate - Int temp = fill_factor*t_nnz; + Int temp = fill_factor*t_nnz; // assuming (t_nnz/2) as triangular part if (temp > t_nnz) { M.nnz = temp; } else { @@ -2298,8 +2304,8 @@ int Basker::sfactor() } if(Options.verbose == BASKER_TRUE) { - printf("leaf with elbow-room global_nnz = %ld, t_nnz = %ld, M.nnz = %ld (%ld x %ld)\n", - (long)global_nnz,(long)t_nnz,(long)M.nnz,(long)M.nrow,(long)M.ncol); + printf("leaf with elbow-room global_nnz = %ld, t_nnz = %ld, M.nnz = %ld (%ld x %ld) with fill-factor x(%d+%f = %f)\n", + (long)global_nnz,(long)t_nnz,(long)M.nnz,(long)M.nrow,(long)M.ncol,(int)BASKER_DOM_NNZ_OVER,Options.user_fill,fill_factor); } } }//end assign_leaf_nnz @@ -2330,12 +2336,12 @@ int Basker::sfactor() } } - #ifdef BASKER_DEBUG_SFACTOR + #ifdef BASKER_TIMER printf("U_assing_nnz: %ld \n", t_nnz); #endif //double fill_factor = 1.05; - Int temp = fill_factor*t_nnz; + Int temp = min(M.nrow*M.ncol, Int(fill_factor*t_nnz)); if (temp >= t_nnz) { M.nnz = temp; } else { @@ -2352,8 +2358,8 @@ int 
Basker::sfactor() #endif if(Options.verbose == BASKER_TRUE) { - printf("U_assing with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %f + %f = %f), M.nnz = %ld (%ld x %ld)\n", - (long)global_nnz,(long)t_nnz, BASKER_DOM_NNZ_OVER,Options.user_fill,fill_factor, (long)M.nnz,(long)M.nrow,(long)M.ncol); + printf("U_assing with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %f), M.nnz = %ld (%ld x %ld)\n", + (long)global_nnz,(long)t_nnz, fill_factor, (long)M.nnz,(long)M.nrow,(long)M.ncol); } } }//end assign_upper_nnz @@ -2384,13 +2390,13 @@ int Basker::sfactor() } } - #ifdef BASKER_DEBUG_SFACTOR + #ifdef BASKER_TIMER printf("L_assign_nnz: %ld \n", t_nnz); #endif // double fill_factor = 2.05; double old_nnz = M.nnz; - Int temp = fill_factor*t_nnz; + Int temp = min(M.nrow*M.ncol, Int(fill_factor*t_nnz)); if (temp >= t_nnz) { M.nnz = temp; } else { @@ -2407,8 +2413,8 @@ int Basker::sfactor() } if(Options.verbose == BASKER_TRUE) { - printf("L_assign with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %e + %e = %e), M.nnz = %ld -> %ld (%ld x %ld)\n", - (long)global_nnz,(long)t_nnz, BASKER_DOM_NNZ_OVER,Options.user_fill, fill_factor, (long)old_nnz,(long)M.nnz, (long)M.nrow,(long)M.ncol); + printf("L_assign with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %f), M.nnz = %ld -> %ld (%ld x %ld)\n", + (long)global_nnz,(long)t_nnz, fill_factor, (long)old_nnz,(long)M.nnz, (long)M.nrow,(long)M.ncol); } } }//end assign_lower_nnz @@ -2482,7 +2488,9 @@ int Basker::sfactor() if ((double)nnz > ((double)lblk_size)*((double)lblk_size)) { nnz = lblk_size*lblk_size; } - //printf( " LBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); + #ifdef BASKER_TIMER + printf( " L_D[%d](%d, size = %d, nnz = %d)\n",i,(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); + #endif L_D[i].init_matrix("LBFT", btf_tabs(i), lblk_size, @@ -2493,6 +2501,9 @@ int Basker::sfactor() //For pruning L_D[i].init_pend(); + #ifdef BASKER_TIMER + printf( " U_D[%d](%d, size = %d, nnz = 
%d)\n",i,(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); + #endif U_D[i].init_matrix("UBFT", btf_tabs(i), lblk_size, @@ -2532,7 +2543,9 @@ int Basker::sfactor() if ((double)nnz > ((double)lblk_size)*((double)lblk_size)) { nnz = lblk_size*lblk_size; } - //printf( " LBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); + #ifdef BASKER_TIMER + printf( " LBTF(%d, size = %d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); + #endif LBTF[i-btf_tabs_offset].init_matrix("LBFT", btf_tabs(i), lblk_size, @@ -2544,7 +2557,9 @@ int Basker::sfactor() //printf( " LBTF(%d).init_pend()\n",(int)(i-btf_tabs_offset) ); LBTF[i-btf_tabs_offset].init_pend(); - //printf( " UBTF(%d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)nnz ); + #ifdef BASKER_TIMER + printf( " UBTF(%d, size = %d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); + #endif UBTF[i-btf_tabs_offset].init_matrix("UBFT", btf_tabs(i), lblk_size, @@ -2579,18 +2594,18 @@ int Basker::sfactor() //BASKER_ASSERT((thread_array(i).iws_size*thread_array(i).iws_mult) > 0, "Basker btf_last_dense assert: sfactor threads iws > 0 failed"); //BASKER_ASSERT((thread_array(i).ews_size*thread_array(i).ews_mult) > 0, "Basker btf_last_dense assert: sfactor threads ews > 0 failed"); + #ifdef BASKER_TIMER + printf("Malloc Thread: %d iws: %d \n", + i, (thread_array[i].iws_size* + thread_array[i].iws_mult)); + printf("Malloc Thread: %d ews: %d \n", + i, (thread_array[i].ews_size* + thread_array[i].ews_mult)); + #endif if (max_blk_size > 0) { MALLOC_INT_1DARRAY(thread_array[i].iws, thread_array[i].iws_size*thread_array[i].iws_mult); MALLOC_ENTRY_1DARRAY(thread_array[i].ews, thread_array[i].ews_size*thread_array[i].ews_mult); } - #ifdef BASKER_DEBUG_SFACTOR - printf("Malloc Thread: %d iws: %d \n", - i, (thread_array(i).iws_size* - thread_array(i).iws_mult)); - printf("Malloc Thread: %d ews: %d \n", - i, (thread_array(i).ews_size* - thread_array(i).ews_mult)); - #endif } } diff --git 
a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp index bd5bc82efdbc..ef1e29d597e4 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp @@ -54,7 +54,6 @@ namespace BaskerNS #ifndef BASKER_KOKKOS FREE_INT_1DARRAY(iws); FREE_ENTRY_1DARRAY(ews); - //C.Finalize(); #endif } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp index 8ea5c54c8e89..c55c222ec7c7 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp @@ -90,7 +90,7 @@ enum BASKER_INCOMPLETE_CODE #define BASKER_INC_TOL_VALUE 0.0001 //MACRO INC FILL (this will become dynamic in the future) -#define BASKER_FILL_USER 1.00 +#define BASKER_FILL_USER 0.00 #define BASKER_FILL_LESTIMATE 1.50 #define BASKER_FILL_UESTIMATE 1.50 #define BASKER_FILL_LLOWERESTIMATE 2.00 @@ -149,13 +149,12 @@ enum BASKER_INCOMPLETE_CODE #define BOOL_2DARRAY Kokkos::View #define INT_2DARRAY std::vector -#define ENTRY_2DARRAY std::vector +#define ENTRY_2DARRAY std::vector #define MATRIX_1DARRAY std::vector #define MATRIX_2DARRAY std::vector #define MATRIX_VIEW_1DARRAY std::vector #define MATRIX_VIEW_2DARRAY std::vector #define THREAD_1DARRAY std::vector -#define THREAD_2DARRAY std::vector #define INT_1DARRAY_PAIRS Kokkos::View*, BASKER_EXE_SPACE> //Macro Memory Calls @@ -193,7 +192,7 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0,"BASKER ASSERT MALLOC int_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = INT_2DARRAY("int_2d",s);*/ \ + /*a = INT_2DARRAY(Kokkos::view_alloc("int_2d", Kokkos::SequentialHostInit),s);*/ \ a.resize(s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -289,16 +288,7 @@ enum BASKER_INCOMPLETE_CODE throw std::bad_alloc(); \ } \ } -#define MALLOC_THREAD_2DARRAY(a,s) \ - { \ - 
BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC thread_2d: size to alloc >= 0 fails"); \ - if (s > 0) { \ - /*a = THREAD_2DARRAY("thread_2d",s);*/ \ - a.resize(s); \ - if(a.data() == NULL) \ - throw std::bad_alloc(); \ - } \ - } + //RESIZE (with copy) #define RESIZE_1DARRAY(a,os,s) \ { \ @@ -427,12 +417,6 @@ enum BASKER_INCOMPLETE_CODE a.resize(0); \ } -#define FREE_THREAD_2DARRAY(a,n) \ - { \ - /*a = TRHEAD_2DARRAY();*/ \ - Kokkos::resize(a,0); \ - } - #else // not BASKER_KOKKOS //Execution Space @@ -458,7 +442,6 @@ enum BASKER_INCOMPLETE_CODE #define MATRIX_VIEW_1DARRAY BASKER_MATRIX_VIEW* #define MATRIX_VIEW_2DARRAY BASKER_MATRIX_VIEW** #define THREAD_1DARRAY BASKER_THREAD* -#define THREAD_2DARRAY BASKER_THREAD** //Macro Memory Calls //Malloc @@ -473,7 +456,6 @@ enum BASKER_INCOMPLETE_CODE #define MALLOC_MATRIX_VIEW_1DARRAY(a,s) a = new BASKER_MATRIX_VIEW [s] #define MALLOC_MATRIX_VIEW_2DARRAY(a,s) a = new MATRIX_VIEW_1DARRAY[s] #define MALLOC_THREAD_1DARRAY(a,s) a = new BASKER_THREAD [s] -#define MALLOC_THREAD_2DARRAY(a,s) a = new THREAD_1DARRAY [s] //Resize (copy old data) (come back and add) #define RESIZE_1DARRAY(a,os,s) BASKER_NO_OP #define RESIZE_2DARRAY(a,os1,os2,s1,s2) BASKER_NO_OP @@ -555,13 +537,6 @@ enum BASKER_INCOMPLETE_CODE FREE(a); \ } -#define FREE_THREAD_2DARRAY(a,n) \ - { \ - for(BASKER_INT MACRO_I = 0; MACRO_I < s; MACRO_I++) \ - FREE(a[MACRO_I]); \ - FREE(a); \ - } - #endif //end ifdef BASKER_KOKKOS //Inline command diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 9cf52f3db66d..3691d73c63ba 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -532,7 +532,11 @@ namespace BaskerNS LU[b][LU_size[b]-1].nnz); #endif - //printf( " lvl=%d: LU(%d,%d): nnz=%d, mnnz=%d\n", (int)lvl, (int)b, (int)LU_size(b)-1, (int)LU(b)(LU_size(b)-1).nnz, (int)LU(b)(LU_size(b)-1).mnnz); + #ifdef 
BASKER_TIMER + printf( " lvl=%d: LU(%d,%d): %dx%d, nnz=%d, mnnz=%d, at (%d,%d)\n", (int)lvl, (int)b, (int)LU_size(b)-1, + (int)LU[b][LU_size(b)-1].nrow,(int)LU[b][LU_size(b)-1].ncol,(int)LU[b][LU_size(b)-1].nnz, (int)LU[b][LU_size(b)-1].mnnz, + (int)LU[b][LU_size(b)-1].srow,(int)LU[b][LU_size(b)-1].scol); + #endif LU[b][LU_size(b)-1].init_matrix("Udiag", LU[b][LU_size(b)-1].srow, LU[b][LU_size(b)-1].nrow, @@ -582,7 +586,12 @@ namespace BaskerNS LU[U_col][U_row].nnz); #endif - //printf( " > l=%d: LU(%d,%d): nnz=%d, mnnz=%d\n", (int)l, (int)U_col, (int)U_row, (int)LU(U_col)(U_row).nnz, (int)LU(U_col)(U_row).mnnz); + #ifdef BASKER_TIMER + printf( " +++ l=%d: LU(%d,%d): %dx%d, nnz=%d, mnnz=%d at (%d,%d)\n", (int)l, (int)U_col, (int)U_row, + (int)LU[U_col][U_row].nrow,(int)LU[U_col][U_row].ncol, + (int)LU[U_col][U_row].nnz, (int)LU[U_col][U_row].mnnz, + (int)LU[U_col][U_row].srow,(int)LU[U_col][U_row].scol); + #endif LU[U_col][U_row].init_matrix("Uoffdiag", LU[U_col][U_row].srow, LU[U_col][U_row].nrow, From 03b1ab0362ca3aa4c620146b57229652570d9c15 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 25 Oct 2024 17:33:32 -0600 Subject: [PATCH 068/243] ShyLU - Basker : replace std::vector with SequentialHostInit Signed-off-by: iyamazaki --- .../basker/src/shylubasker_error_manager.hpp | 150 +++++++-------- .../basker/src/shylubasker_nfactor_blk.hpp | 74 ++++---- .../src/shylubasker_nfactor_blk_inc.hpp | 171 +++++++++--------- .../basker/src/shylubasker_nfactor_col.hpp | 148 +++++++-------- .../basker/src/shylubasker_nfactor_col2.hpp | 44 ++--- .../src/shylubasker_nfactor_col_inc.hpp | 158 ++++++++-------- .../basker/src/shylubasker_nfactor_diag.hpp | 58 +++--- .../basker/src/shylubasker_sfactor.hpp | 64 +++---- .../basker/src/shylubasker_sfactor_inc.hpp | 24 +-- .../basker/src/shylubasker_tree.hpp | 22 +-- .../basker/src/shylubasker_types.hpp | 85 +++------ .../basker/src/shylubasker_util.hpp | 68 +++---- 12 files changed, 513 insertions(+), 553 deletions(-) diff --git 
a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp index 84cbb8b801b7..d9695c6e5c78 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp @@ -46,66 +46,66 @@ namespace BaskerNS for(Int ti = 0; ti < num_threads; ti++) { //Note: jdb we can make this into a switch - if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) + if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) { threads_start(ti) = BASKER_MAX_IDX; continue; - } else if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) + } else if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " DOMBLK SINGULAR: blk=" << thread_array[ti].error_blk + << " DOMBLK SINGULAR: blk=" << thread_array(ti).error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) + } else if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " DOMBLK NOMALLOC : blk=" << thread_array[ti].error_blk + << " DOMBLK NOMALLOC : blk=" << thread_array(ti).error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) + } else if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) { - BASKER_ASSERT(thread_array[ti].error_blk >= 0, "nfactor_dom_error error_blk"); + BASKER_ASSERT(thread_array(ti).error_blk >= 0, "nfactor_dom_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREAD: " << ti - << " DOMBLK MALLOC : blk=" << thread_array[ti].error_blk - << " subblk=" << thread_array[ti].error_subblk - << " newsize=" << thread_array[ti].error_info + << " DOMBLK MALLOC : blk=" << thread_array(ti).error_blk + << " subblk=" << thread_array(ti).error_subblk + << " newsize=" << 
thread_array(ti).error_info << std::endl; } //If on diagonal, want to compare L and U Int resize_L = BASKER_MAX_IDX; Int resize_U = BASKER_MAX_IDX; - if(thread_array[ti].error_subblk != BASKER_MAX_IDX) + if(thread_array(ti).error_subblk != BASKER_MAX_IDX) { - BASKER_ASSERT(thread_array[ti].error_info > 0, "L) newsize not big enough"); - resize_L = thread_array[ti].error_info; + BASKER_ASSERT(thread_array(ti).error_info > 0, "L) newsize not big enough"); + resize_L = thread_array(ti).error_info; //if L is already bigger and U, //We will want re size U as, well - if(thread_array[ti].error_subblk == 0) + if(thread_array(ti).error_subblk == 0) { - Int blkcol = thread_array[ti].error_blk; + Int blkcol = thread_array(ti).error_blk; Int blkUrow = LU_size(blkcol)-1; if(LL[blkcol][0].nnz >= LU[blkcol][blkUrow].nnz) { - resize_U = thread_array[ti].error_info; + resize_U = thread_array(ti).error_info; } }//if - a domain } //We don't care about the other way since, //L is already checked before U. - if(thread_array[ti].error_subblk == -1) + if(thread_array(ti).error_subblk == -1) { - resize_U = thread_array[ti].error_info; + resize_U = thread_array(ti).error_info; } //Resize L, if resize_L != -1 (meaning realloc-L is requested) @@ -116,7 +116,7 @@ namespace BaskerNS std::cout << " ++ resize L( tid = " << ti << " ): new size = " << resize_L << std::endl; } BASKER_MATRIX &L = - LL[thread_array[ti].error_blk][thread_array[ti].error_subblk]; + LL[thread_array(ti).error_blk][thread_array(ti).error_subblk]; REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -142,7 +142,7 @@ namespace BaskerNS std::cout << " ++ resize U( tid = " << ti << " ): new size = " << resize_U << std::endl; } BASKER_MATRIX &U = - LU[thread_array[ti].error_blk][0]; + LU[thread_array(ti).error_blk][0]; REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -153,7 +153,7 @@ namespace BaskerNS U.nnz = resize_U; //Still need to clear pend BASKER_MATRIX &L = - LL[thread_array[ti].error_blk][0]; + 
LL[thread_array(ti).error_blk][0]; L.clear_pend(); } @@ -163,11 +163,11 @@ namespace BaskerNS { //Clear workspace, whole column for(Int sb = 0; - sb < LL_size(thread_array[ti].error_blk); + sb < LL_size(thread_array(ti).error_blk); sb++) { BASKER_MATRIX &SL = - LL[thread_array[ti].error_blk][sb]; + LL[thread_array(ti).error_blk][sb]; for(Int i = 0; i < SL.iws_size*SL.iws_mult; ++i) { SL.iws(i) = (Int) 0; @@ -198,13 +198,13 @@ namespace BaskerNS }//for - sb (subblks) }//if ws is filled - threads_start(ti) = thread_array[ti].error_blk; + threads_start(ti) = thread_array(ti).error_blk; //Reset - thread_array[ti].error_type = BASKER_ERROR_NOERROR; - thread_array[ti].error_blk = BASKER_MAX_IDX; - thread_array[ti].error_info = BASKER_MAX_IDX; + thread_array(ti).error_type = BASKER_ERROR_NOERROR; + thread_array(ti).error_blk = BASKER_MAX_IDX; + thread_array(ti).error_info = BASKER_MAX_IDX; nthread_remalloc++; }//if REMALLOC @@ -231,26 +231,26 @@ namespace BaskerNS for(Int ti = 0; ti < num_threads; ti++) { //Note: jdb we can make this into a switch - if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) + if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) { thread_start(ti) = BASKER_MAX_IDX; continue; } - else if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) + else if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREAD: " << ti - << " SEPBLK SINGULAR: blk=" << thread_array[ti].error_blk + << " SEPBLK SINGULAR: blk=" << thread_array(ti).error_blk << std::endl; } return BASKER_ERROR; - } else if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) + } else if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) { if(Options.verbose == BASKER_TRUE) { std::cout << "ERROR THREADS: " << ti - << " SEPBLK NOMALLOC: blk=" << thread_array[ti].error_blk + << " SEPBLK NOMALLOC: blk=" << thread_array(ti).error_blk << std::endl; } return BASKER_ERROR; @@ -260,22 +260,22 @@ namespace BaskerNS Int 
error_sep_lvl = BASKER_MAX_IDX; for(Int l = 1; l < tree.nlvls+1; l++) { - if(thread_array[ti].error_blk == S[l][ti]) + if(thread_array(ti).error_blk == S(l)(ti)) { error_sep_lvl = l; break; } } - if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) + if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) { - BASKER_ASSERT(thread_array[ti].error_blk >= 0, "nfactor_SEP_error error_blk"); + BASKER_ASSERT(thread_array(ti).error_blk >= 0, "nfactor_SEP_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREADS: " << ti - << " SEPBLK MALLOC: blk=" << thread_array[ti].error_blk - << " subblk=" << thread_array[ti].error_subblk - << " newsize=" << thread_array[ti].error_info + << " SEPBLK MALLOC: blk=" << thread_array(ti).error_blk + << " subblk=" << thread_array(ti).error_subblk + << " newsize=" << thread_array(ti).error_info << std::endl; std::cout << " > SEPLVL: " << error_sep_lvl << std::endl; } @@ -283,9 +283,9 @@ namespace BaskerNS //If on diagonal, want to compare L and U Int resize_L = BASKER_MAX_IDX; Int resize_U = BASKER_MAX_IDX; - if(thread_array[ti].error_subblk <= -1) + if(thread_array(ti).error_subblk <= -1) { - resize_L = thread_array[ti].error_info; + resize_L = thread_array(ti).error_info; if(Options.verbose == BASKER_TRUE) { std::cout << " ++ L size: " << resize_L << std::endl; @@ -293,9 +293,9 @@ namespace BaskerNS } //We don't care about the other way since, //L is already checked before U. 
- if(thread_array[ti].error_subblk > -1) + if(thread_array(ti).error_subblk > -1) { - resize_U = thread_array[ti].error_info; + resize_U = thread_array(ti).error_info; if(Options.verbose == BASKER_TRUE) { std::cout << " ++ U size: " << resize_U << std::endl; @@ -305,9 +305,9 @@ namespace BaskerNS //Resize L, if resize_L != -1 (meaning realloc-L is requested) if(resize_L != BASKER_MAX_IDX) { - const Int tsb = (-1*thread_array[ti].error_subblk)-1; + const Int tsb = (-1*thread_array(ti).error_subblk)-1; BASKER_MATRIX &L = - LL[thread_array[ti].error_blk][tsb]; + LL[thread_array(ti).error_blk][tsb]; REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -322,9 +322,9 @@ namespace BaskerNS //Resize U, if resize_U != -1 (meaning realloc-U is requested) if(resize_U != BASKER_MAX_IDX) { - const Int tsb = thread_array[ti].error_subblk; + const Int tsb = thread_array(ti).error_subblk; BASKER_MATRIX &U = - LU[thread_array[ti].error_blk][tsb]; + LU[thread_array(ti).error_blk][tsb]; REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -346,7 +346,7 @@ namespace BaskerNS //Though this could be done in parallel in the future for(Int p = 0; p < num_threads; p++) { - Int blk = S[0][p]; + Int blk = S(0)(p); //if(LL(blk)(0).w_fill == BASKER_TRUE) { //Clear workspace, whole column @@ -369,7 +369,7 @@ namespace BaskerNS Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A for(Int p = 0; p < num_threads; p++) { - Int blk = S[error_sep_lvl][p]; + Int blk = S(error_sep_lvl)(p); //if(LL(blk)(0).w_fill == BASKER_TRUE) { BASKER_MATRIX &TM = LL[blk][0]; @@ -386,7 +386,7 @@ namespace BaskerNS //Note, will have to clear the perm in all sep blk in that level //Clear permuation BASKER_MATRIX &SL = - LL[thread_array[ti].error_blk][0]; + LL[thread_array(ti).error_blk][0]; //printf( " + scol_top = %d, srow = %d, nrowl = %d\n",scol_top,SL.srow,SL.nrow ); for(Int i = scol_top + SL.srow; i < scol_top + (SL.srow+SL.nrow); i++) { @@ -394,12 +394,12 @@ namespace BaskerNS gperm(i) = 
BASKER_MAX_IDX; }//for--to clear perm - thread_start(ti) = thread_array[ti].error_blk; + thread_start(ti) = thread_array(ti).error_blk; //Reset - thread_array[ti].error_type = BASKER_ERROR_NOERROR; - thread_array[ti].error_blk = BASKER_MAX_IDX; - thread_array[ti].error_info = BASKER_MAX_IDX; + thread_array(ti).error_type = BASKER_ERROR_NOERROR; + thread_array(ti).error_blk = BASKER_MAX_IDX; + thread_array(ti).error_info = BASKER_MAX_IDX; for(Int i = 0; i < num_threads; i++) { @@ -451,9 +451,9 @@ namespace BaskerNS Int btab = btf_tabs_offset; for(Int ti = 0; ti < num_threads; ti++) { - Int c = thread_array[ti].error_blk; + Int c = thread_array(ti).error_blk; //Note: jdb we can make this into a switch - if(thread_array[ti].error_type == BASKER_ERROR_NOERROR) + if(thread_array(ti).error_type == BASKER_ERROR_NOERROR) { if (c >= btab) { thread_start(ti) = BASKER_MAX_IDX; @@ -463,7 +463,7 @@ namespace BaskerNS continue; }//end if NOERROR - if(thread_array[ti].error_type == BASKER_ERROR_SINGULAR) + if(thread_array(ti).error_type == BASKER_ERROR_SINGULAR) { if(Options.verbose == BASKER_TRUE) { @@ -474,7 +474,7 @@ namespace BaskerNS return BASKER_ERROR; }//end if SINGULAR - if(thread_array[ti].error_type == BASKER_ERROR_NOMALLOC) + if(thread_array(ti).error_type == BASKER_ERROR_NOMALLOC) { std::cout << "ERROR_THREADS: " << ti << " DIAGBLK NOMALLOC blk=" << c @@ -482,16 +482,16 @@ namespace BaskerNS return BASKER_ERROR; }//end if NOMALLOC - if(thread_array[ti].error_type == BASKER_ERROR_REMALLOC) + if(thread_array(ti).error_type == BASKER_ERROR_REMALLOC) { - Int liwork = thread_array[ti].iws_size*thread_array[ti].iws_mult; - Int lework = thread_array[ti].ews_size*thread_array[ti].ews_mult; + Int liwork = thread_array(ti).iws_size*thread_array(ti).iws_mult; + Int lework = thread_array(ti).ews_size*thread_array(ti).ews_mult; BASKER_ASSERT(c >= 0, "nfactor_diag_error error_blk"); if(Options.verbose == BASKER_TRUE) { std::cout << " > THREADS: " << ti << " DIAGBLK MALLOC blk=" << 
c - << " newsize=" << thread_array[ti].error_info + << " newsize=" << thread_array(ti).error_info << " for both L( " << c << " ) and U( " << c << " )" << std::endl; @@ -504,11 +504,11 @@ namespace BaskerNS for(Int i = 0; i < liwork; i++) { - thread_array[ti].iws(i) = (Int) 0; + thread_array(ti).iws(i) = (Int) 0; } for(Int i = 0; i < lework; i++) { - thread_array[ti].ews(i) = zero; + thread_array(ti).ews(i) = zero; } //Resize L @@ -516,12 +516,12 @@ namespace BaskerNS L.clear_pend(); REALLOC_INT_1DARRAY(L.row_idx, L.nnz, - thread_array[ti].error_info); + thread_array(ti).error_info); REALLOC_ENTRY_1DARRAY(L.val, L.nnz, - thread_array[ti].error_info); - L.mnnz = thread_array[ti].error_info; - L.nnz = thread_array[ti].error_info; + thread_array(ti).error_info); + L.mnnz = thread_array(ti).error_info; + L.nnz = thread_array(ti).error_info; for(Int i = 0; i < L.ncol; i++) { L.col_ptr(i) = 0; @@ -536,12 +536,12 @@ namespace BaskerNS BASKER_MATRIX &U = (c >= btab ? UBTF[c-btab] : U_D[c]); REALLOC_INT_1DARRAY(U.row_idx, U.nnz, - thread_array[ti].error_info); + thread_array(ti).error_info); REALLOC_ENTRY_1DARRAY(U.val, U.nnz, - thread_array[ti].error_info); - U.mnnz = thread_array[ti].error_info; - U.nnz = thread_array[ti].error_info; + thread_array(ti).error_info); + U.mnnz = thread_array(ti).error_info; + U.nnz = thread_array(ti).error_info; for(Int i = 0; i < U.ncol; i++) { U.col_ptr(i) = 0; @@ -561,9 +561,9 @@ namespace BaskerNS } //Reset - thread_array[ti].error_type = BASKER_ERROR_NOERROR; - thread_array[ti].error_blk = BASKER_MAX_IDX; - thread_array[ti].error_info = BASKER_MAX_IDX; + thread_array(ti).error_type = BASKER_ERROR_NOERROR; + thread_array(ti).error_blk = BASKER_MAX_IDX; + thread_array(ti).error_info = BASKER_MAX_IDX; nthread_remalloc++; @@ -593,7 +593,7 @@ namespace BaskerNS { for(Int ti = 0; ti < num_threads; ti++) { - thread_array[ti].error_type = BASKER_ERROR_NOERROR; + thread_array(ti).error_type = BASKER_ERROR_NOERROR; } } diff --git 
a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp index 6613d992dbc2..030d526299a1 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp @@ -149,7 +149,7 @@ namespace BaskerNS const Mag normA = BTF_A.gnorm; const Mag normA_blk = BTF_A.anorm; - Int b = S[0][kid]; //Which blk from schedule + Int b = S(0)(kid); //Which blk from schedule BASKER_MATRIX &L = LL[b][0]; BASKER_MATRIX &U = LU[b][LU_size(b)-1]; BASKER_MATRIX &M = ALM[b][0]; //A->blk @@ -159,9 +159,9 @@ namespace BaskerNS ENTRY_1DARRAY X = LL[b][0].ews; Int ws_size = LL[b][0].iws_size; #else //else if BASKER_2DL - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[kid].ews; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(kid).ews; + Int ws_size = thread_array(kid).iws_size; #endif //Int bcol = L.scol; //begining col //NOT UD Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A @@ -577,11 +577,11 @@ namespace BaskerNS } } if (!explicit_pivot) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_SINGULAR; - thread_array[kid].error_blk = b; - thread_array[kid].error_subblk = 0; - thread_array[kid].error_info = k; + thread_array(kid).error_blk = b; + thread_array(kid).error_subblk = 0; + thread_array(kid).error_info = k; return BASKER_ERROR; } } @@ -676,17 +676,17 @@ namespace BaskerNS (int)kid, (long)b, (long)llnnz, (long)lnnz, (long)lcnt, (int)lnnz, (int)M.nrow, (long)newsize); } - thread_array[kid].error_blk = b; - thread_array[kid].error_subblk = 0; + thread_array(kid).error_blk = b; + thread_array(kid).error_subblk = 0; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } 
else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } @@ -701,17 +701,17 @@ namespace BaskerNS (int)kid, (long)b, (long)uunnz, (long)unnz+ucnt, (long)k, (int)uunnz, (int)M.nrow, (int)newsize); } - thread_array[kid].error_blk = b; - thread_array[kid].error_subblk = -1; + thread_array(kid).error_blk = b; + thread_array(kid).error_subblk = -1; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } @@ -981,7 +981,7 @@ namespace BaskerNS ) { //Setup variables - const Int wsb = S[0][kid]; + const Int wsb = S(0)(kid); INT_1DARRAY ws = LL[wsb][l].iws; const Int ws_size = LL[wsb][l].iws_size; @@ -1011,11 +1011,11 @@ namespace BaskerNS ) { const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A - const Int b = S[lvl][kid]; + const Int b = S(lvl)(kid); //const Int wsb = S(0)(kid); BASKER_MATRIX &L = LL[b][0]; - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int U_row = LU_size(U_col)-1; if(lvl > 0) { @@ -1128,8 +1128,8 @@ namespace BaskerNS { //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); BASKER_MATRIX &L = LL[b][0]; const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_g = L.srow + scol_top; // global offset @@ -1279,15 +1279,15 @@ namespace BaskerNS { //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); BASKER_MATRIX 
&L = LL[b][0]; #ifdef BASKER_2DL INT_1DARRAY ws = LL[wsb][l].iws; const Int ws_size = LL[wsb][l].iws_size; #else - INT_1DARRAY ws = thread_array[kid].iws; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + Int ws_size = thread_array(kid).iws_size; #endif const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A @@ -1452,17 +1452,17 @@ namespace BaskerNS Int k, Int top, Int xnnz) { - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); BASKER_MATRIX &L = LL[b][0]; #ifdef BASKER_2DL INT_1DARRAY ws = LL[wsb][l].iws; ENTRY_1DARRAY X = LL[wsb][l].ews; Int ws_size = LL[wsb][l].iws_size; #else - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[kid].ews; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(kid).ews; + Int ws_size = thread_array(kid).iws_size; #endif const Entry zero (0.0); @@ -1658,17 +1658,17 @@ namespace BaskerNS (long)blkcol, (long)blkrow, (long)kid, (long)llnnz, (long)lnnz, (long)p_size ); } - thread_array[kid].error_blk = blkcol; - thread_array[kid].error_subblk = blkrow; + thread_array(kid).error_blk = blkcol; + thread_array(kid).error_subblk = blkrow; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } //BASKER_ASSERT(0==1, "REALLOC LOWER BLOCK\n"); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp index 48dae30f95c9..cf6fd8b3c0d9 100644 --- 
a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp @@ -130,14 +130,14 @@ namespace BaskerNS BASKER_INLINE int Basker::t_nfactor_blk_inc_lvl(Int kid) { - Int b = S[0][kid]; //Which blk from schedule - BASKER_MATRIX &L = LL[b][0]; - BASKER_MATRIX &U = LU[b][LU_size(b)-1]; - BASKER_MATRIX &M = ALM[b][0]; //A->blk + Int b = S(0)(kid); //Which blk from schedule + BASKER_MATRIX &L = LL(b)(0); + BASKER_MATRIX &U = LU(b)(LU_size(b)-1); + BASKER_MATRIX &M = ALM(b)(0); //A->blk - INT_1DARRAY ws = LL[b][0].iws; - ENTRY_1DARRAY X = LL[b][0].ews; - Int ws_size = LL[b][0].iws_size; + INT_1DARRAY ws = LL(b)(0).iws; + ENTRY_1DARRAY X = LL(b)(0).ews; + Int ws_size = LL(b)(0).iws_size; Int brow = L.srow; //begining row Int lval = 0; @@ -384,10 +384,10 @@ namespace BaskerNS << pivot << endl; cout << "lcnt: " << lcnt << endl; } - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_SINGULAR; - thread_array[kid].error_blk = b; - thread_array[kid].error_info = k; + thread_array(kid).error_blk = b; + thread_array(kid).error_info = k; return BASKER_ERROR; } @@ -410,17 +410,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_blk = b; - thread_array[kid].error_subblk = 0; - thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = b; + thread_array(kid).error_subblk = 0; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } @@ -441,17 +441,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - 
thread_array[kid].error_blk = b; - thread_array[kid].error_subblk = -1; - thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = b; + thread_array(kid).error_subblk = -1; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } @@ -665,9 +665,9 @@ namespace BaskerNS { //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); const Int brow = L.srow; INT_1DARRAY ws = LL[wsb][l].iws; @@ -936,12 +936,12 @@ namespace BaskerNS ) { //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -985,9 +985,9 @@ namespace BaskerNS //Will want to make this backward in the future //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); const Int brow = L.srow; INT_1DARRAY ws = LL[wsb][l].iws; @@ -1353,12 +1353,12 @@ namespace BaskerNS //We note that this can be fixed to be faster - const Int b = S[lvl][kid]; - const Int wsb = S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; - INT_1DARRAY ws = LL[wsb][l].iws; - ENTRY_1DARRAY X = LL[wsb][l].ews; - const Int ws_size = LL[wsb][l].iws_size; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); + INT_1DARRAY ws = LL(wsb)(l).iws; + ENTRY_1DARRAY X = LL(wsb)(l).ews; + const Int ws_size = LL(wsb)(l).iws_size; Int brow = L.srow; @@ -1441,12 +1441,12 @@ namespace BaskerNS { //We note that this can be fixed to be faster - const Int b = S[lvl][kid]; - const Int wsb = 
S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; - INT_1DARRAY ws = LL[wsb][l].iws; - ENTRY_1DARRAY X = LL[wsb][l].ews; - const Int ws_size = LL[wsb][l].iws_size; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); + INT_1DARRAY ws = LL(wsb)(l).iws; + ENTRY_1DARRAY X = LL(wsb)(l).ews; + const Int ws_size = LL(wsb)(l).iws_size; Int brow = L.srow; Int *color = &(ws(0)); @@ -2105,18 +2105,18 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_blk = blkcol; - thread_array[kid].error_subblk = blkrow; - thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = blkcol; + thread_array(kid).error_subblk = blkrow; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } @@ -2176,18 +2176,18 @@ namespace BaskerNS BASKER_INLINE int Basker::t_nfactor_blk_old(Int kid) { - Int b = S[0][kid]; //Which blk from schedule - BASKER_MATRIX &L = LL[b][0]; - BASKER_MATRIX &U = LU[b][LU_size[b]-1]; + Int b = S(0)(kid); //Which blk from schedule + BASKER_MATRIX &L = LL(b)(0); + BASKER_MATRIX &U = LU(b)(LU_size[b]-1); #ifdef BASKER_2DL printf("Accessing blk: %d \n", b); - INT_1DARRAY ws = LL[b][0].iws; - ENTRY_1DARRAY X = LL[b][0].ews; - Int ws_size = LL[b][0].iws_size; + INT_1DARRAY ws = LL(b)(0).iws; + ENTRY_1DARRAY X = LL(b)(0).ews; + Int ws_size = LL(b)(0).iws_size; #else //else if BASKER_2DL - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[kid].ews; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(kid).ews; + Int ws_size = thread_array(kid).iws_size; #endif Int bcol = L.scol; //begining col @@ -2576,15 +2576,15 @@ namespace BaskerNS { //Setup variables - const Int b = S[lvl][kid]; - const Int wsb = 
S[0][kid]; - BASKER_MATRIX &L = LL[b][0]; + const Int b = S(lvl)(kid); + const Int wsb = S(0)(kid); + BASKER_MATRIX &L = LL(b)(0); #ifdef BASKER_2DL - INT_1DARRAY ws = LL[wsb][l].iws; - Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + Int ws_size = LL(wsb)(l).iws_size; #else - INT_1DARRAY ws = thread_array[kid].iws; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + Int ws_size = thread_array(kid).iws_size; #endif const Int brow = L.srow; @@ -2729,8 +2729,8 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM[blkcol][blkrow]; /* @@ -2750,17 +2750,16 @@ namespace BaskerNS BASKER_MATRIX *LPP = &LU[LP_col][0]; if(LP_row != BASKER_MAX_IDX) { - LPP = &(LL[LP_col][LP_row]); + LPP = &(LL(LP_col)(LP_row)); } BASKER_MATRIX &LP = *(LPP); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; - - Int nnz = LL[X_col][X_row].p_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; + Int nnz = LL(X_col)(X_row).p_size; @@ -2948,7 +2947,7 @@ namespace BaskerNS }//over all nonzero in left - LL[X_col][X_row].p_size = nnz; + LL(X_col)(X_row).p_size = nnz; return; @@ -2969,7 +2968,7 @@ namespace BaskerNS Int x_size, Int x_offset, BASKER_BOOL A_option) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); BASKER_MATRIX &B = ALM[blkcol][blkrow]; INT_1DARRAY ws = LL[X_col][X_row].iws; @@ -3840,9 +3839,9 @@ namespace BaskerNS ) { - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); const Int team_leader = find_leader(kid,sl); - const Int leader_idx = S[0][team_leader]; + const Int leader_idx = S(0)(team_leader); //Int loop_col_idx = S(l)(kid); //printf("Reduce col fill called, kid: %d leader: %d \n", @@ 
-3941,12 +3940,12 @@ namespace BaskerNS //printf("===========T ADD ORIG FILL CALLED\n"); const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S[lvl][leader_id]; + const Int L_col = S(lvl)(leader_id); Int L_row = 0; //const Int U_col = S(lvl)(leader_id); //Int U_row = LU_size(U_col)-1; //Int X_col = S(0)(leader_id); - Int X_col = S[0][kid]; + Int X_col = S(0)(kid); Int X_row = l+1; @@ -3977,7 +3976,7 @@ namespace BaskerNS //Int L_row = 0; //const Int U_col = S(lvl)(leader_id); //Int U_row = LU_size(U_col)-1; - Int X_col = S[0][leader_id]; + Int X_col = S(0)(leader_id); Int X_row = l+1; //printf("=***== fill MY ID: %d LEADER ID: %d ===** \n", @@ -3987,7 +3986,7 @@ namespace BaskerNS { Int bl = l+1; - Int A_col = S[lvl][kid]; + Int A_col = S(lvl)(kid); /* printf("leader_id: %d kid: %d lvl: %d l: %d blk: %d %d \n", @@ -3996,16 +3995,16 @@ namespace BaskerNS */ Int my_row_leader = find_leader(kid, lvl-1); Int my_new_row = - S[bl][kid] - S[0][my_row_leader]; + S(bl)(kid) - S(0)(my_row_leader); - Int A_row = (lvl==l)?(2):S[bl][kid]%(LU_size(A_col)); - if((S[bl](kid)>14) && - (S[bl](kid)>LU_size(A_col)) && + Int A_row = (lvl==l)?(2):S(bl)(kid)%(LU_size(A_col)); + if((S(bl)(kid)>14) && + (S(bl)(kid)>LU_size(A_col)) && (lvl != 1)) { - Int tm = (S[bl][kid]+1)/16; - A_row = ((S[bl][kid]+1)-(tm*16))%LU_size(A_col); + Int tm = (S(bl)(kid)+1)/16; + A_row = ((S(bl)(kid)+1)-(tm*16))%LU_size(A_col); } /* diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp index 9c77c1f38994..68246464f757 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp @@ -134,7 +134,7 @@ namespace BaskerNS double barrier_time = 0; #endif - Int U_col = S[lvl][kid]; + Int U_col = S(lvl)(kid); Int U_row = 0; const Int scol = LU[U_col][U_row].scol; @@ -435,8 +435,8 @@ namespace 
BaskerNS for(Int l = 0; l < lvl; l++) { printf("OPS. KID : %d LVL: %d OPS : %d \n", - kid, l, thread_array[kid].ops_counts[l][0]); - thread_array[kid].ops_count[1][0] = 0; + kid, l, thread_array(kid).ops_counts[l][0]); + thread_array(kid).ops_count[1][0] = 0; } #endif @@ -460,15 +460,15 @@ namespace BaskerNS const Entry zero (0.0); //Get needed variables - const Int L_col = S[l][kid]; - const Int U_col = S[lvl][kid]; + const Int L_col = S(l)(kid); + const Int U_col = S(lvl)(kid); - Int my_row_leader = S[0][find_leader(kid,lvl-1)]; + Int my_row_leader = S(0)(find_leader(kid,lvl-1)); //Int my_new_row = // L_col - my_row_leader; Int U_row = L_col - my_row_leader; - const Int X_col = S[0][kid]; + const Int X_col = S(0)(kid); const Int X_row = l; //X_row = lower(L) //const Int col_idx_offset = 0; //we might be able to remove @@ -493,7 +493,7 @@ namespace BaskerNS } else { - Bp = &(thread_array[kid].C); + Bp = &(thread_array(kid).C); //printf("Using temp matrix, kid: %d\n", kid); //Bp->print(); } @@ -613,7 +613,7 @@ namespace BaskerNS //Count ops to show imbalance #ifdef BASKER_COUNT_OPS - thread_array[kid].ops_counts[0][l] += xnnz; + thread_array(kid).ops_counts[0][l] += xnnz; #endif //WE SHOUD DO A UNNZ COUNT @@ -649,17 +649,17 @@ namespace BaskerNS Int newsize = (unnz+U.nrow) * 1.2 ; - thread_array[kid].error_blk = U_col; - thread_array[kid].error_subblk = U_row; + thread_array(kid).error_blk = U_col; + thread_array(kid).error_subblk = U_row; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; }//if/else realloc } @@ -741,10 +741,10 @@ namespace BaskerNS std::cout << "----Error--- kid = " << kid << ": extra L[" << j << "]=" << 
X[j] << " with gperm( " << brow_g << " + " << j << " ) = " << t << std::endl; - thread_array[kid].error_type = BASKER_ERROR_OTHER; - thread_array[kid].error_blk = lvl; - thread_array[kid].error_subblk = l; - thread_array[kid].error_info = k; + thread_array(kid).error_type = BASKER_ERROR_OTHER; + thread_array(kid).error_blk = lvl; + thread_array(kid).error_subblk = l; + thread_array(kid).error_info = k; info = BASKER_ERROR; //BASKER_ASSERT(t != BASKER_MAX_IDX, "lower entry in U"); #endif @@ -864,11 +864,11 @@ namespace BaskerNS int lteam_size = pow(2, l); #ifdef BASKER_2DL - Int L_col = S[l][my_leader]; + Int L_col = S(l)(my_leader); Int L_row = 0; - Int U_col = S[lvl][kid]; - Int U_row = (lvl==1)?(kid%2):S[l][kid]%LU_size(U_col); - Int X_col = S[0][my_leader]; + Int U_col = S(lvl)(kid); + Int U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); + Int X_col = S(0)(my_leader); Int X_row = l; //this will change for us Int col_idx_offset = 0; BASKER_MATRIX &U = LU[U_col][U_row]; @@ -959,12 +959,12 @@ namespace BaskerNS ) { - Int b = S[l][kid]; + Int b = S(l)(kid); BASKER_MATRIX &L = LL[b][0]; - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[team_leader].ews; - Int ws_size = thread_array[kid].iws_size; - Int ews_size = thread_array[team_leader].ews_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(team_leader).ews; + Int ws_size = thread_array(kid).iws_size; + Int ews_size = thread_array(team_leader).ews_size; #ifdef BASKER_DEBUG_NFACTOR_COL if(kid>3) @@ -1066,11 +1066,11 @@ namespace BaskerNS const Mag normA_blk = BTF_A.anorm; //Get needed variables - const Int L_col = S[lvl][kid]; + const Int L_col = S(lvl)(kid); const Int L_row = 0; - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); const Int U_row = LU_size(U_col)-1; - const Int X_col = S[0][kid]; + const Int X_col = S(0)(kid); //Int col_idx_offset = 0; //can we get rid of now? 
#ifdef BASKER_DEBUG_NFACTOR_COL @@ -1083,7 +1083,7 @@ namespace BaskerNS BASKER_MATRIX &L = LL[L_col][L_row]; BASKER_MATRIX &U = LU[U_col][U_row]; - BASKER_MATRIX &B = thread_array[kid].C; + BASKER_MATRIX &B = thread_array(kid).C; #ifdef BASKER_DEBUG_NFACTOR_COL if(kid >= 0) @@ -1237,7 +1237,7 @@ namespace BaskerNS #endif #ifdef BASKER_OPS_COUNT - thread_array[kid].ops_counts[0][l] += xnnz; + thread_array(kid).ops_counts[0][l] += xnnz; #endif t_back_solve(kid, lvl,l+1, k, top, xnnz); // note: l not lvl given @@ -1327,10 +1327,10 @@ namespace BaskerNS X(maxindex) = pivot; } else { // replace-tiny-pivot not requested, or the current column is structurally empty after elimination - thread_array[kid].error_type = BASKER_ERROR_SINGULAR; - thread_array[kid].error_blk = L_col; - thread_array[kid].error_subblk = -1; - thread_array[kid].error_info = k; + thread_array(kid).error_type = BASKER_ERROR_SINGULAR; + thread_array(kid).error_blk = L_col; + thread_array(kid).error_subblk = -1; + thread_array(kid).error_info = k; return BASKER_ERROR; } } else if (Options.replace_tiny_pivot && normA_blk > abs(zero) && abs(pivot) < normA_blk * sqrt(eps)) { @@ -1374,17 +1374,17 @@ namespace BaskerNS //cout << " > L_col = " << L_col << " L_row = " << L_row << endl; } - thread_array[kid].error_blk = L_col; - thread_array[kid].error_subblk = -1; + thread_array(kid).error_blk = L_col; + thread_array(kid).error_subblk = -1; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -1399,17 +1399,17 @@ namespace BaskerNS << endl; } - thread_array[kid].error_blk = U_col; - thread_array[kid].error_subblk = U_row; + thread_array(kid).error_blk = U_col; + 
thread_array(kid).error_subblk = U_row; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -1640,11 +1640,11 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S[lvl][leader_id]; + const Int L_col = S(lvl)(leader_id); Int L_row = 0; - const Int U_col = S[lvl][leader_id]; + const Int U_col = S(lvl)(leader_id); Int U_row = LU_size(U_col)-1; - Int X_col = S[0][leader_id]; + Int X_col = S(0)(leader_id); Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? @@ -1743,15 +1743,15 @@ namespace BaskerNS //Setup - Int A_col = S[lvl][kid]; - Int A_row = (lvl==1)?(2):S[l+1][kid]%(LU_size(A_col)); + Int A_col = S(lvl)(kid); + Int A_row = (lvl==1)?(2):S(l+1)(kid)%(LU_size(A_col)); BASKER_MATRIX &B = AVM[A_col][A_col]; - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); team_leader = find_leader(kid, l); - const Int leader_idx = S[0][team_leader]; - Int loop_col_idx = S[l][kid]; + const Int leader_idx = S(0)(team_leader); + Int loop_col_idx = S(l)(kid); #ifdef BASKER_DEBUG_NFACTOR_COL printf("Called t_blk_col_copy_atomic kid: %d " , kid); @@ -1909,15 +1909,15 @@ namespace BaskerNS //Setup - Int A_col = S[lvl][kid]; - Int A_row = (lvl==1)?(2):S[l+1][kid]%(LU_size(A_col)); + Int A_col = S(lvl)(kid); + Int A_row = (lvl==1)?(2):S(l+1)(kid)%(LU_size(A_col)); BASKER_MATRIX &B = AVM[A_col][A_col]; - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); team_leader = find_leader(kid, l); - const Int leader_idx = S[0][team_leader]; - Int loop_col_idx = S[l][kid]; + const Int leader_idx = S(0)(team_leader); + Int loop_col_idx = S(l)(kid); #ifdef 
BASKER_DEBUG_NFACTOR_COL printf("Called t_blk_col_copy_atomic kid: %d " , kid); @@ -2073,8 +2073,8 @@ namespace BaskerNS //printf("-----------------copy_update_matrx----------"); //printf("\n\n\n\n"); - Int leader_idx = S[0][kid]; - BASKER_MATRIX &C = thread_array[kid].C; + Int leader_idx = S(0)(kid); + BASKER_MATRIX &C = thread_array(kid).C; Int nnz = 0; //COME BACK HERE @@ -2089,8 +2089,8 @@ namespace BaskerNS // for(Int bl = l+1; bl < last_blk; bl++) { Int bl = l+1; - Int A_col = S[lvl][kid]; - Int A_row = (lvl==1)?(2):S[bl][kid]%(LU_size(A_col)); + Int A_col = S(lvl)(kid); + Int A_row = (lvl==1)?(2):S(bl)(kid)%(LU_size(A_col)); Int CM_idx = kid; BASKER_MATRIX *Bp; @@ -2173,8 +2173,8 @@ namespace BaskerNS // l, last_blk, kid); for(Int bl=l+1; bl BB; - BB.Barrier(thread_array[leader_kid].token[sublvl][function_n], - thread_array[leader_kid].token[sublvl][1], + BB.Barrier(thread_array(leader_kid).token[sublvl][function_n], + thread_array(leader_kid).token[sublvl][1], size); */ } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp index 342835bd640c..4389365a84d6 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp @@ -135,7 +135,7 @@ namespace BaskerNS // 3) t_lower_col_factor : factor A(7,7), sequential // 4) t_lower_col_factor_offdiag2 : compute L(8:end, 7) - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); const Int U_row = 0; Int ncol = LU[U_col][U_row].ncol; Int my_leader = find_leader(kid, 0); @@ -181,7 +181,7 @@ namespace BaskerNS t_basker_barrier(thread, kid, my_leader, b_size, 0, LU[U_col][U_row].scol, 0); for(Int tid = 0; tid < num_threads; tid++) { - if (thread_array[tid].error_type != BASKER_SUCCESS) { + if (thread_array(tid).error_type != BASKER_SUCCESS) { info = BASKER_ERROR; } } @@ -250,7 +250,7 @@ namespace BaskerNS t_basker_barrier(thread, 
kid, my_leader, b_size, 3, LU[U_col][U_row].scol, 0); for(Int ti = 0; ti < num_threads; ti++) { - if (thread_array[kid].error_type != BASKER_SUCCESS) { + if (thread_array(kid).error_type != BASKER_SUCCESS) { info = BASKER_ERROR; } } @@ -336,7 +336,7 @@ namespace BaskerNS t_basker_barrier(thread, kid, my_leader, b_size, 4, k, lvl-1); for(Int tid = 0; tid < num_threads; tid++) { - if (thread_array[tid].error_type != BASKER_SUCCESS) { + if (thread_array(tid).error_type != BASKER_SUCCESS) { info = BASKER_ERROR; } } @@ -395,7 +395,7 @@ namespace BaskerNS #ifdef BASKER_TIMER double time_factot = timer.seconds(); if((kid%(Int)(pow(2,lvl))) == 0) { - const Int L_col = S[lvl][kid]; + const Int L_col = S(lvl)(kid); const Int L_row = LU_size(U_col)-1; printf("Time Lower-Col(%d): %lf, n = %d, nnz(L) = %d, nnz(U) = %d \n", (int)kid, time_factot, @@ -446,7 +446,7 @@ namespace BaskerNS #endif //This will do the correct spmv - if(thread_array[kid].error_type == BASKER_ERROR_NOERROR) { + if(thread_array(kid).error_type == BASKER_ERROR_NOERROR) { t_upper_col_factor_offdiag2(kid, lvl, sl,l, k, lower); } //Barrier--Start @@ -461,7 +461,7 @@ namespace BaskerNS //Barrier--End if(kid%((Int)pow(2,sl)) == 0 && - thread_array[kid].error_type == BASKER_ERROR_NOERROR) { + thread_array(kid).error_type == BASKER_ERROR_NOERROR) { t_dense_blk_col_copy_atomic2(kid, my_leader, lvl, sl, l, k, lower); } @@ -477,7 +477,7 @@ namespace BaskerNS #endif }//over all sublevels - if(thread_array[kid].error_type == BASKER_ERROR_NOERROR) { + if(thread_array(kid).error_type == BASKER_ERROR_NOERROR) { t_dense_copy_update_matrix2(kid, my_leader, lvl, l, k); } }//end t_add_add @@ -507,10 +507,10 @@ namespace BaskerNS return; } - Int my_row_leader = S[0][find_leader(kid,lvl-1)]; - const Int L_col = S[sl][my_leader]; - const Int U_col = S[lvl][kid]; - const Int X_col = S[0][my_leader]; + Int my_row_leader = S(0)(find_leader(kid,lvl-1)); + const Int L_col = S(sl)(my_leader); + const Int U_col = S(lvl)(kid); + const 
Int X_col = S(0)(my_leader); Int L_row = l-sl+1; //Might have to think about th Int U_row = L_col-my_row_leader; Int X_row = l+1; //this will change for us @@ -588,10 +588,10 @@ namespace BaskerNS //Setup //printf("DEBUG, kid: %d k: %d A_col: %d A_row: %d \n", // kid, k, A_col, A_row); - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); //should remove either as a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S[0][team_leader]; + const Int leader_idx = S(0)(team_leader); #ifdef BASKER_DEBUG_NFACTOR_COL2 if(lower == BASKER_TRUE) { @@ -709,8 +709,8 @@ namespace BaskerNS //printf("\n\n\n\n"); const Entry zero (0.0); - const Int leader_idx = S[0][kid]; - BASKER_MATRIX &C = thread_array[kid].C; + const Int leader_idx = S(0)(kid); + BASKER_MATRIX &C = thread_array(kid).C; Int nnz = 0; //Over each blk @@ -724,10 +724,10 @@ namespace BaskerNS // X += B(:, k) { Int bl = l+1; - Int A_col = S[lvl][kid]; + Int A_col = S(lvl)(kid); - Int my_row_leader = S[0][find_leader(kid,lvl-1)]; - Int A_row = S[bl][kid] - my_row_leader; + Int my_row_leader = S(0)(find_leader(kid,lvl-1)); + Int A_row = S(bl)(kid) - my_row_leader; BASKER_MATRIX *Bp; if(A_row != (LU_size(A_col)-1)) @@ -875,13 +875,13 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S[lvl][leader_id]; - const Int U_col = S[lvl][leader_id]; + const Int L_col = S(lvl)(leader_id); + const Int U_col = S(lvl)(leader_id); Int L_row = 0; Int U_row = LU_size(U_col)-1; - Int X_col = S[0][leader_id]; + Int X_col = S(0)(leader_id); Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? 
diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp index ee72c5d32c7b..02fde7c7ccad 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp @@ -83,7 +83,7 @@ namespace BaskerNS ) { - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int U_row = 0; //const Int scol = LU(U_col)(U_row).scol; @@ -577,12 +577,12 @@ namespace BaskerNS ) { l = l+1; - Int my_token = S[l][kid]; + Int my_token = S(l)(kid); Int my_loc = kid; while((my_loc > 0)) { my_loc--; - if(S[l][my_loc] != my_token) + if(S(l)(my_loc) != my_token) { my_loc++; break; @@ -615,14 +615,14 @@ namespace BaskerNS //Get needed variables - const Int L_col = S[l][kid]; + const Int L_col = S(l)(kid); // const Int L_row = 0; //NDE - warning: unused - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); //Int my_new_row = // L_col - S(0)(my_row_leader); - Int U_row = L_col - S[0][my_row_leader]; + Int U_row = L_col - S(0)(my_row_leader); /* Int U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); @@ -642,7 +642,7 @@ namespace BaskerNS //U_row = my_new_row; - const Int X_col = S[0][kid]; + const Int X_col = S(0)(kid); const Int X_row = l; //X_row = lower(L) //const Int col_idx_offset = 0; //we might be able to remove @@ -664,7 +664,7 @@ namespace BaskerNS } else { - Bp = &(thread_array[kid].C); + Bp = &(thread_array(kid).C); } BASKER_MATRIX &B = *Bp; //if(kid ==0) @@ -920,18 +920,18 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { //printf("HERE\n"); - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_blk = U_col; - thread_array[kid].error_subblk = U_row; - 
thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = U_col; + thread_array(kid).error_subblk = U_row; + thread_array(kid).error_info = newsize; return BASKER_ERROR; }//if/else realloc }//if need to realloc @@ -1086,26 +1086,26 @@ namespace BaskerNS // kid, lvl, sl, l); } - const Int L_col = S[sl][my_leader]; + const Int L_col = S(sl)(my_leader); Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S[0][my_row_leader]; + L_col - S(0)(my_row_leader); // Int U_row = my_new_row; Int U_row = - (lvl==1)?(kid%2):S[sl][kid]%LU_size(U_col); - if((S[sl][kid] > 14) && - (S[sl][kid] > LU_size(U_col)) && + (lvl==1)?(kid%2):S(sl)(kid)%LU_size(U_col); + if((S(sl)(kid) > 14) && + (S(sl)(kid) > LU_size(U_col)) && (lvl != 1)) { //printf("lower offdiag new num, %d %d \n", // S(sl)(kid), LU_size(U_col)); - Int tm = (S[sl][kid]+1)/16; - U_row = ((S[sl][kid]+1) - (tm*16))%LU_size(U_col); + Int tm = (S(sl)(kid)+1)/16; + U_row = ((S(sl)(kid)+1) - (tm*16))%LU_size(U_col); } //printf("UFF kid:%d U: %d %d new: %d leader: %d %d lvl: %d l: %d sl: %d \n", @@ -1116,7 +1116,7 @@ namespace BaskerNS //JDB PASS TEST U_row = my_new_row; - const Int X_col = S[0][my_leader]; + const Int X_col = S(0)(my_leader); Int X_row = l+1; //this will change for us //Int col_idx_offset = 0; @@ -1256,18 +1256,18 @@ namespace BaskerNS return; } - const Int L_col = S[sl][my_leader]; + const Int L_col = S(sl)(my_leader); Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S[0][my_row_leader]; + L_col - S(0)(my_row_leader); Int U_row = 0; U_row = my_new_row; - const Int X_col = S[0][my_leader]; + const Int X_col = S(0)(my_leader); Int X_row = l+1; //this will change for us Int col_idx_offset = 0; @@ -1276,11 +1276,11 @@ namespace 
BaskerNS //Need to give them the output pattern - Int U_pattern_col = S[lvl][kid]; + Int U_pattern_col = S(lvl)(kid); Int my_pattern_leader = find_leader_inc_lvl(kid,l); - Int U_pattern_row = S[l+1][my_pattern_leader] - - S[0][my_row_leader]; + Int U_pattern_row = S(l+1)(my_pattern_leader) - + S(0)(my_row_leader); /* printf("Test mypleader: %d myrowleader: %d kid: %d\n", @@ -1292,7 +1292,7 @@ namespace BaskerNS */ - Int L_pattern_col = S[lvl][kid]; + Int L_pattern_col = S(lvl)(kid); Int L_pattern_row = BASKER_MAX_IDX; if(lower == BASKER_TRUE) { @@ -1418,26 +1418,26 @@ namespace BaskerNS return; } - const Int L_col = S[sl][my_leader]; + const Int L_col = S(sl)(my_leader); Int L_row = l-sl+1; //Might have to think about th - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - L_col - S[0][my_row_leader]; + L_col - S(0)(my_row_leader); // Int U_row = my_new_row; Int U_row = - (lvl==1)?(kid%2):S[sl][kid]%LU_size(U_col); - if((S[sl][kid] > 14) && - (S[sl][kid] > LU_size(U_col)) && + (lvl==1)?(kid%2):S(sl)(kid)%LU_size(U_col); + if((S(sl)(kid) > 14) && + (S(sl)(kid) > LU_size(U_col)) && (lvl != 1)) { - Int tm = (S[sl][kid]+1)/16; - U_row = ((S[sl][kid]+1) - (tm*16))%LU_size(U_col); + Int tm = (S(sl)(kid)+1)/16; + U_row = ((S(sl)(kid)+1) - (tm*16))%LU_size(U_col); } // printf("lowerspmv kid: %d U: %d %d new %d leader: %d %d lvl: %d %d %d \n", @@ -1448,7 +1448,7 @@ namespace BaskerNS U_row = my_new_row; - const Int X_col = S[0][my_leader]; + const Int X_col = S(0)(my_leader); Int X_row = l+1; //this will change for us Int col_idx_offset = 0; @@ -1538,8 +1538,8 @@ namespace BaskerNS ) { - const Int leader_idx = S[0][kid]; - BASKER_MATRIX &C = thread_array[kid].C; + const Int leader_idx = S(0)(kid); + BASKER_MATRIX &C = thread_array(kid).C; Int nnz = 0; // Int gbrow = 0; //NDE - warning: unused @@ -1549,11 +1549,11 @@ namespace BaskerNS { //Copy B -> C Int bl = l+1; - Int A_col = S[lvl][kid]; + Int 
A_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - S[bl][kid] - S[0][my_row_leader]; + S(bl)(kid) - S(0)(my_row_leader); Int A_row = 0; A_row = my_new_row; @@ -1653,8 +1653,8 @@ namespace BaskerNS ) { - const Int leader_idx = S[0][kid]; - BASKER_MATRIX &C = thread_array[kid].C; + const Int leader_idx = S(0)(kid); + BASKER_MATRIX &C = thread_array(kid).C; Int nnz = 0; Int gbrow = 0; @@ -1672,24 +1672,24 @@ namespace BaskerNS { //Copy B -> C Int bl = l+1; - Int A_col = S[lvl][kid]; + Int A_col = S(lvl)(kid); Int my_row_leader = find_leader(kid,lvl-1); Int my_new_row = - S[bl][kid] - S[0][my_row_leader]; + S(bl)(kid) - S(0)(my_row_leader); //Int A_row = my_new_row; - Int A_row = (lvl==1)?(2):S[bl][kid]%(LU_size(A_col)); - if((S[bl][kid] > 14) && - (S[bl][kid] > LU_size(A_col)) && + Int A_row = (lvl==1)?(2):S(bl)(kid)%(LU_size(A_col)); + if((S(bl)(kid) > 14) && + (S(bl)(kid) > LU_size(A_col)) && (lvl != 1)) { //printf("test cm %d %d %d \n", // kid, S(bl)(kid), LU_size(A_col)); - Int tm = (S[bl][kid]+1)/16; - A_row = ((S[bl][kid]+1) - (tm*16))%LU_size(A_col); + Int tm = (S(bl)(kid)+1)/16; + A_row = ((S(bl)(kid)+1) - (tm*16))%LU_size(A_col); } @@ -1883,12 +1883,12 @@ namespace BaskerNS ) { //Get needed variables - const Int L_col = S[lvl][kid]; + const Int L_col = S(lvl)(kid); const Int L_row = 0; - const Int U_col = S[lvl][kid]; + const Int U_col = S(lvl)(kid); const Int U_row = LU_size(U_col)-1; - const Int X_col = S[0][kid]; + const Int X_col = S(0)(kid); //Int col_idx_offset = 0; //can we get rid of now? 
@@ -1905,7 +1905,7 @@ namespace BaskerNS BASKER_MATRIX &L = LL[L_col][L_row]; BASKER_MATRIX &U = LU[U_col][U_row]; - BASKER_MATRIX &B = thread_array[kid].C; + BASKER_MATRIX &B = thread_array(kid).C; #ifdef BASKER_DEBUG_NFACTOR_COL if(kid >= 0) @@ -2201,17 +2201,17 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_blk = L_col; - thread_array[kid].error_subblk = -1; - thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = L_col; + thread_array(kid).error_subblk = -1; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -2229,16 +2229,16 @@ namespace BaskerNS if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; } else { - thread_array[kid].error_type = + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_blk = U_col; - thread_array[kid].error_subblk = U_row; - thread_array[kid].error_info = newsize; + thread_array(kid).error_blk = U_col; + thread_array(kid).error_subblk = U_row; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -2462,11 +2462,11 @@ namespace BaskerNS const Int leader_id = find_leader(kid, l); const Int lteam_size = pow(2,l+1); - const Int L_col = S[lvl][leader_id]; + const Int L_col = S(lvl)(leader_id); Int L_row = 0; - const Int U_col = S[lvl][leader_id]; + const Int U_col = S(lvl)(leader_id); Int U_row = LU_size(U_col)-1; - Int X_col = S[0][leader_id]; + Int X_col = S(0)(leader_id); Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? 
@@ -2585,9 +2585,9 @@ namespace BaskerNS //const Int lteam_size = pow(2,l+1); //NDE - warning: unused // const Int L_col = S(lvl)(leader_id); //NDE - warning: unused // Int L_row = 0; //NDE - warning: unused - const Int U_col = S[lvl][leader_id]; + const Int U_col = S(lvl)(leader_id); Int U_row = LU_size(U_col)-1; - Int X_col = S[0][leader_id]; + Int X_col = S(0)(leader_id); Int X_row = l+1; //Int col_idx_offset = 0; //can get rid of?//NDE - warning: unused @@ -2621,11 +2621,11 @@ namespace BaskerNS const BASKER_BOOL lower ) { - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); //should remove either as a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S[0][team_leader]; + const Int leader_idx = S(0)(team_leader); //If I an not a leader, then need to copy over if(kid != team_leader) @@ -2682,7 +2682,7 @@ namespace BaskerNS - Int U_pattern_col = S[lvl][kid]; + Int U_pattern_col = S(lvl)(kid); Int U_pattern_row = BASKER_MAX_IDX; if(blk == l+1) @@ -2691,11 +2691,11 @@ namespace BaskerNS //S(0)(find_leader(kid,lvl)); //U_pattern_row = S(l+1)(kid) - //S(0)(my_pattern_leader); - U_pattern_row = S[l+1][kid] - - S[0][find_leader(kid,lvl-1)]; + U_pattern_row = S(l+1)(kid) - + S(0)(find_leader(kid,lvl-1)); } - Int L_pattern_col = S[lvl][kid]; + Int L_pattern_col = S(lvl)(kid); Int L_pattern_row = BASKER_MAX_IDX; if(lower == BASKER_TRUE) { @@ -2769,11 +2769,11 @@ namespace BaskerNS //BASKER_MATRIX &B = AVM(A_col)(A_col); - const Int my_idx = S[0][kid]; + const Int my_idx = S(0)(kid); //should remove either as a paramter or here Int team_leader = find_leader(kid, sl); - const Int leader_idx = S[0][team_leader]; + const Int leader_idx = S(0)(team_leader); //Int loop_col_idx = S(l)(kid); NU //#ifdef BASKER_DEBUG_NFACTOR_COL2 diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp index dc59708fe158..b87a0f48eadf 100644 --- 
a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp @@ -294,9 +294,9 @@ namespace BaskerNS printf("Error: NaN diag in single factor\n"); } } - thread_array[kid].error_type = BASKER_ERROR_SINGULAR; - thread_array[kid].error_blk = c; - thread_array[kid].error_info = k; + thread_array(kid).error_type = BASKER_ERROR_SINGULAR; + thread_array(kid).error_blk = c; + thread_array(kid).error_info = k; return BASKER_ERROR; } @@ -373,9 +373,9 @@ namespace BaskerNS Mag rmin_ (0.0); //workspace - Int ws_size = thread_array[kid].iws_size; - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[kid].ews; + Int ws_size = thread_array(kid).iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(kid).ews; Int *color = &(ws(0)); Int *pattern = &(color[ws_size]); @@ -580,9 +580,9 @@ namespace BaskerNS << " Column: " << k << std::endl; } - thread_array[kid].error_type = BASKER_ERROR_NAN; - thread_array[kid].error_blk = c; - thread_array[kid].error_info = k; + thread_array(kid).error_type = BASKER_ERROR_NAN; + thread_array(kid).error_blk = c; + thread_array(kid).error_info = k; return BASKER_ERROR; } absv = abs(value); @@ -714,9 +714,9 @@ namespace BaskerNS pivot = normA_blk * eps; X(maxindex) = pivot; } else { - thread_array[kid].error_type = BASKER_ERROR_SINGULAR; - thread_array[kid].error_blk = c; - thread_array[kid].error_info = k; + thread_array(kid).error_type = BASKER_ERROR_SINGULAR; + thread_array(kid).error_blk = c; + thread_array(kid).error_info = k; return BASKER_ERROR; } } @@ -780,16 +780,16 @@ namespace BaskerNS (long)btf_tabs(c), (long)btf_tabs(c+1), (long)(btf_tabs(c+1)-btf_tabs(c))); } - thread_array[kid].error_blk = c; + thread_array(kid).error_blk = c; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - 
thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -804,16 +804,16 @@ namespace BaskerNS printf("blk: %ld column: %ld \n", (long)c, (long)k); } - thread_array[kid].error_blk = c; + thread_array(kid).error_blk = c; if(Options.realloc == BASKER_FALSE) { - thread_array[kid].error_type = BASKER_ERROR_NOMALLOC; + thread_array(kid).error_type = BASKER_ERROR_NOMALLOC; return BASKER_ERROR; } else { - thread_array[kid].error_type = BASKER_ERROR_REMALLOC; - thread_array[kid].error_info = newsize; + thread_array(kid).error_type = BASKER_ERROR_REMALLOC; + thread_array(kid).error_info = newsize; return BASKER_ERROR; } } @@ -991,8 +991,8 @@ namespace BaskerNS ) { //printf("=======LOCAL REACH BTF SHORT CALLED (pattern[top=%d - 1] = %d) =====\n",(int)top, (int)j); - INT_1DARRAY ws = thread_array[kid].iws; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + Int ws_size = thread_array(kid).iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1014,8 +1014,8 @@ namespace BaskerNS { //printf("=======LOCAL REACH BTF CALLED =====\n"); - INT_1DARRAY ws = thread_array[kid].iws; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + Int ws_size = thread_array(kid).iws_size; /*{ printf("ws_size: %d \n", ws_size); @@ -1144,8 +1144,8 @@ namespace BaskerNS ) { - INT_1DARRAY ws = thread_array[kid].iws; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + Int ws_size = thread_array(kid).iws_size; /* printf("ws_size: %d \n", ws_size); @@ -1289,9 +1289,9 @@ namespace BaskerNS { const Entry zero (0.0); - INT_1DARRAY ws = thread_array[kid].iws; - ENTRY_1DARRAY X = thread_array[kid].ews; - Int ws_size = thread_array[kid].iws_size; + INT_1DARRAY ws = thread_array(kid).iws; + ENTRY_1DARRAY X = thread_array(kid).ews; + 
Int ws_size = thread_array(kid).iws_size; Int brow = L.srow; Int *color = &(ws(0)); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp index d48f0e720a7e..ef9bdb8084ef 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp @@ -285,8 +285,8 @@ int Basker::sfactor() for(Int ii=0; ii < split_num; ii++) { BASKER_ASSERT(A.ncol > 0, "Basker symmetric_sfactor assert: A.ncol malloc > 0 failed"); - MALLOC_INT_1DARRAY(gScol[ii], A.ncol); - init_value(gScol[ii], A.ncol, (Int)0); + MALLOC_INT_1DARRAY(gScol(ii), A.ncol); + init_value(gScol(ii), A.ncol, (Int)0); } @@ -298,8 +298,8 @@ int Basker::sfactor() for(Int ii=0; ii < split_num; ii++) { BASKER_ASSERT(A.nrow > 0, "sfactor A.nrow malloc"); - MALLOC_INT_1DARRAY(gSrow[ii], A.nrow); - init_value(gSrow[ii], A.nrow, (Int)0); + MALLOC_INT_1DARRAY(gSrow(ii), A.nrow); + init_value(gSrow(ii), A.nrow, (Int)0); } #ifdef BASKER_TIMER @@ -334,7 +334,7 @@ int Basker::sfactor() for(Int p = 0; p < num_threads; ++p) #endif { - Int blk = S[0][p]; + Int blk = S(0)(p); if(Options.verbose == BASKER_TRUE) { printf(" ============= DOMAIN BLK (p=%d) ============\n",(int)p); @@ -409,18 +409,18 @@ int Basker::sfactor() for(Int p = 0; p < num_threads; ++p) { //Do off diag - Int blk = S[0][p]; + Int blk = S(0)(p); #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[p]; #endif for(Int l =0; l < tree.nlvls; l++) { - Int U_col = S[l+1][p]; + Int U_col = S(l+1)(p); //Note: Need to think more about this flow //Should be subtracted by how many times in the //future - Int my_row_leader = S[0][find_leader(p,l)]; + Int my_row_leader = S(0)(find_leader(p,l)); //Int my_new_row = // blk - my_row_leader; Int U_row = blk-my_row_leader; @@ -436,16 +436,16 @@ int Basker::sfactor() Int off_diag = 1; //printf( " U_blk_sfactor(AVM(%d,%d))\n",U_col,U_row ); //U_blk_sfactor(AV[U_col][U_row], stree, 
- // gScol[l], gSrow[glvl],0); + // gScol(l), gSrow(glvl),0); #ifdef BASKER_TIMER timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST U_blk_sfactor(AVM[U_col][U_row], stree_p, - gScol[l], gSrow[glvl], off_diag); + gScol(l), gSrow(glvl), off_diag); #else U_blk_sfactor(AVM[U_col][U_row], stree, - gScol[l], gSrow[glvl], off_diag); + gScol(l), gSrow(glvl), off_diag); #endif #ifdef BASKER_TIMER time3 += timer1.seconds(); @@ -533,11 +533,11 @@ int Basker::sfactor() (long)U_col, (long)U_row, (long)lvl, (long)pp); #endif - Int U_col = S[lvl+1][ppp]; + Int U_col = S(lvl+1)(ppp); Int U_row = 0; //S_blk_sfactor(AL[U_col][U_row], stree, - //gScol[lvl], gSrow[pp]); + //gScol(lvl), gSrow(pp)); #ifdef BASKER_TIMER printf( " >>> S_blk_sfactor( ALM(%d)(%d) with %dx%d and nnz=%d) <<<\n",U_col,U_row, ALM[U_col][U_row].nrow,ALM[U_col][U_row].ncol,ALM[U_col][U_row].nnz ); fflush(stdout); @@ -545,10 +545,10 @@ int Basker::sfactor() #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[pp]; S_blk_sfactor(ALM[U_col][U_row], stree_p, - gScol[lvl], gSrow[pp]); + gScol(lvl), gSrow(pp)); #else S_blk_sfactor(ALM[U_col][U_row], stree, - gScol[lvl], gSrow[pp]); + gScol(lvl), gSrow(pp)); #endif #ifdef BASKER_TIMER printf( " >>> -> nnz = %d\n",ALM[U_col][U_row].nnz ); fflush(stdout); @@ -592,20 +592,20 @@ int Basker::sfactor() Int ppp; ppp = pp*pow(tree.nparts, lvl+1); - Int U_col = S[lvl+1][ppp]; + Int U_col = S(lvl+1)(ppp); Int U_row = 0; Int inner_blk = U_col; for(Int l = lvl+1; l < tree.nlvls; l++) { //printf( " --- pp = %d/%d, l = %d/%d ---\n",pp,p, l,tree.nlvls ); fflush(stdout); - U_col = S[l+1][ppp]; - U_row = S[lvl+1][ppp]%LU_size(U_col); + U_col = S(l+1)(ppp); + U_row = S(lvl+1)(ppp)%LU_size(U_col); - Int my_row_leader = S[0][find_leader(ppp,l)]; + Int my_row_leader = S(0)(find_leader(ppp,l)); //Int my_new_row = // S(lvl+1)(ppp) - my_row_leader; - U_row = S[lvl+1][ppp] - my_row_leader; + U_row = S(lvl+1)(ppp) - my_row_leader; #ifdef BASKER_DEBUG_SFACTOR printf("offida sep, lvl: %d 
l: %d U_col: %d U_row: %d \n", lvl, l, U_col, U_row); @@ -615,10 +615,10 @@ int Basker::sfactor() Int off_diag = 1; #ifdef SHYLU_BASKER_STREE_LIST U_blk_sfactor(AVM[U_col][U_row], stree_p, - gScol[l], gSrow[pp], off_diag); + gScol(l), gSrow(pp), off_diag); #else U_blk_sfactor(AVM[U_col][U_row], stree, - gScol[l], gSrow[pp], off_diag); + gScol(l), gSrow(pp), off_diag); #endif //In symmetric will not need @@ -656,8 +656,8 @@ int Basker::sfactor() for(Int ii = 0 ; ii < split_num; ++ii) { //printf("split\n"); - FREE(gScol[ii]); - FREE(gSrow[ii]); + FREE(gScol(ii)); + FREE(gSrow(ii)); } FREE(gScol); FREE(gSrow); @@ -2589,22 +2589,22 @@ int Basker::sfactor() for(Int i = 0 ; i < num_threads; i++) { - thread_array[i].iws_size = max_blk_size; - thread_array[i].ews_size = max_blk_size; + thread_array(i).iws_size = max_blk_size; + thread_array(i).ews_size = max_blk_size; //BASKER_ASSERT((thread_array(i).iws_size*thread_array(i).iws_mult) > 0, "Basker btf_last_dense assert: sfactor threads iws > 0 failed"); //BASKER_ASSERT((thread_array(i).ews_size*thread_array(i).ews_mult) > 0, "Basker btf_last_dense assert: sfactor threads ews > 0 failed"); #ifdef BASKER_TIMER printf("Malloc Thread: %d iws: %d \n", - i, (thread_array[i].iws_size* - thread_array[i].iws_mult)); + i, (thread_array(i).iws_size* + thread_array(i).iws_mult)); printf("Malloc Thread: %d ews: %d \n", - i, (thread_array[i].ews_size* - thread_array[i].ews_mult)); + i, (thread_array(i).ews_size* + thread_array(i).ews_mult)); #endif if (max_blk_size > 0) { - MALLOC_INT_1DARRAY(thread_array[i].iws, thread_array[i].iws_size*thread_array[i].iws_mult); - MALLOC_ENTRY_1DARRAY(thread_array[i].ews, thread_array[i].ews_size*thread_array[i].ews_mult); + MALLOC_INT_1DARRAY(thread_array(i).iws, thread_array(i).iws_size*thread_array(i).iws_mult); + MALLOC_ENTRY_1DARRAY(thread_array(i).ews, thread_array(i).ews_size*thread_array(i).ews_mult); } } } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp 
b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp index ec7774a43f13..890bc8a17fca 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp @@ -99,20 +99,20 @@ namespace BaskerNS for(Int p=0; p < num_threads; ++p) { - Int blk = S[0][p]; + Int blk = S(0)(p); sfactor_nd_dom_estimate(ALM[blk][0], LL[blk][0], LU[blk][LU_size(blk)-1]); for(Int l=0; l < tree.nlvls; l++) { - Int U_col = S[l+1][p]; + Int U_col = S(l+1)(p); Int my_row_leader = find_leader(p,l); Int my_new_row = - blk - S[0][my_row_leader]; + blk - S(0)(my_row_leader); - Int U_row = (l==0)?(p%2):S[0][p]%LU_size(U_col); + Int U_row = (l==0)?(p%2):S(0)(p)%LU_size(U_col); if((blk > 14) && (blk > LU_size(U_col)) && (l!=0)) @@ -138,7 +138,7 @@ namespace BaskerNS for(Int pp=0; pp < pow(tree.nparts, tree.nlvls-lvl-1); pp++) { Int ppp = pp*pow(tree.nparts, lvl+1); - Int U_col = S[lvl+1][ppp]; + Int U_col = S(lvl+1)(ppp); Int U_row = 0; sfactor_nd_sep_estimate(ALM[U_col][U_row], @@ -148,19 +148,19 @@ namespace BaskerNS Int innerblk = U_col; for(Int l = lvl+1; l < tree.nlvls; l++) { - U_col = S[l+1][ppp]; + U_col = S(l+1)(ppp); Int my_row_leader = find_leader(ppp,l); Int my_new_row = - S[lvl+1][ppp] - S[0][my_row_leader]; + S(lvl+1)(ppp) - S(0)(my_row_leader); - U_row = S[lvl+1][ppp]%LU_size(U_col); - if((S[lvl+1][ppp] > 14) && - (S[lvl+1][ppp] > LU_size(U_col)) + U_row = S(lvl+1)(ppp)%LU_size(U_col); + if((S(lvl+1)(ppp) > 14) && + (S(lvl+1)(ppp) > LU_size(U_col)) ) { - Int tm = (S[lvl+1][ppp]+1)/16; - U_row = ((S[lvl+1][ppp]+1) - + Int tm = (S(lvl+1)(ppp)+1)/16; + U_row = ((S(lvl+1)(ppp)+1) - (tm*16))%LU_size(U_col); } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp index 81e3c78c7f9c..74a478591636 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp +++ 
b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp @@ -118,7 +118,7 @@ namespace BaskerNS for(Int i =0; i < tree.nblks+1; i++) { BASKER_ASSERT(num_threads > 0, "tree num_threads"); - MALLOC_INT_1DARRAY(S[i], num_threads); + MALLOC_INT_1DARRAY(S(i), num_threads); } //this will want to be across all threads @@ -335,7 +335,7 @@ namespace BaskerNS l, t, lvl_counter ,lvl_idx, tree.nblks); #endif - S[l][t] = tree.lvlset[lvl_idx]; + S(l)(t) = tree.lvlset[lvl_idx]; if(lvl_counter >= (pow(tree.nparts,l)-1)) { lvl_idx++; @@ -356,7 +356,7 @@ namespace BaskerNS { for(Int t=0; t < num_threads; t++) { - cout << S[l][t] << " , " ; + cout << S(l)(t) << " , " ; }//end over nhreads cout << endl; }//end over nlvls @@ -368,11 +368,11 @@ namespace BaskerNS { for(Int t=0; t < num_threads; t++) { - Int s_element = S[l][t]; + Int s_element = S(l)(t); Int row_size = (tree.row_tabs[s_element+1] - tree.row_tabs[s_element]); - thread_array[t].iws_size += row_size; - thread_array[t].ews_size += row_size; + thread_array(t).iws_size += row_size; + thread_array(t).ews_size += row_size; }//end over threads }//end over lvls @@ -592,7 +592,7 @@ namespace BaskerNS l, t, lvl_counter ,lvl_idx, tree.nblks); #endif - S[l][t] = tree.lvlset[lvl_idx]; + S(l)(t) = tree.lvlset[lvl_idx]; if(lvl_counter >= (pow(tree.nparts,l)-1)) { lvl_idx++; @@ -611,7 +611,7 @@ namespace BaskerNS { for(Int t=0; t < num_threads; t++) { - cout << S[l][t] << " , " ; + cout << S(l)(t) << " , " ; }//end over nhreads cout << endl; }//end over nlvls @@ -624,10 +624,10 @@ namespace BaskerNS { for(Int t=0; t < num_threads; t++) { - Int s_element = S[l][t]; + Int s_element = S(l)(t); Int row_size = (tree.row_tabs[s_element+1] - tree.row_tabs[s_element]); - thread_array[t].iws_size += row_size; - thread_array[t].ews_size += row_size; + thread_array(t).iws_size += row_size; + thread_array(t).ews_size += row_size; }//end over threads }//end over lvls diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp 
b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp index c55c222ec7c7..193ecb11e24a 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp @@ -148,13 +148,13 @@ enum BASKER_INCOMPLETE_CODE #define BOOL_1DARRAY Kokkos::View #define BOOL_2DARRAY Kokkos::View -#define INT_2DARRAY std::vector -#define ENTRY_2DARRAY std::vector -#define MATRIX_1DARRAY std::vector -#define MATRIX_2DARRAY std::vector -#define MATRIX_VIEW_1DARRAY std::vector -#define MATRIX_VIEW_2DARRAY std::vector -#define THREAD_1DARRAY std::vector +#define INT_2DARRAY Kokkos::View +#define ENTRY_2DARRAY Kokkos::View +#define MATRIX_1DARRAY Kokkos::View +#define MATRIX_2DARRAY Kokkos::View +#define MATRIX_VIEW_1DARRAY Kokkos::View +#define MATRIX_VIEW_2DARRAY Kokkos::View +#define THREAD_1DARRAY Kokkos::View #define INT_1DARRAY_PAIRS Kokkos::View*, BASKER_EXE_SPACE> //Macro Memory Calls @@ -163,7 +163,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC malloc_pairs_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = INT_1DARRAY_PAIRS(BASKER_KOKKOS_NOINIT("pairs_1d"),s);*/ \ Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -192,8 +191,7 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0,"BASKER ASSERT MALLOC int_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = INT_2DARRAY(Kokkos::view_alloc("int_2d", Kokkos::SequentialHostInit),s);*/ \ - a.resize(s); \ + a = INT_2DARRAY(Kokkos::view_alloc("int_2d", Kokkos::SequentialHostInit),s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -202,7 +200,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC entry_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = ENTRY_1DARRAY(BASKER_KOKKOS_NOINIT("entry_1d"),s);*/ \ Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -212,8 +209,7 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, 
"BASKER ASSERT MALLOC entry_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = ENTRY_2DARRAY("entry_2d",s);*/ \ - a.resize(s); \ + a = ENTRY_2DARRAY(Kokkos::view_alloc("matrix_2d", Kokkos::SequentialHostInit),s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -222,7 +218,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC bool_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = BOOL_1DARRAY(BASKER_KOKKOS_NOINIT("bool_1d"), s);*/ \ Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -232,7 +227,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC bool_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = BOOL_2DARRAY("bool_2d", s);*/ \ Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -242,8 +236,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = MATRIX_1DARRAY("matrix_1d",s)*/ \ - a.resize(s); \ + a = MATRIX_1DARRAY(Kokkos::view_alloc("matrix_1d", Kokkos::SequentialHostInit),s); \ + Kokkos::resize(a,s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -252,8 +246,8 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = MATRIX_2DARRAY("matrix_2d",s);*/ \ - a.resize(s); \ + a = MATRIX_2DARRAY(Kokkos::view_alloc("matrix_2d", Kokkos::SequentialHostInit),s); \ + Kokkos::resize(a,s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -262,8 +256,7 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_view_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = MATRIX_VIEW_1DARRAY("matrix_view_1d",s);*/ \ - a.resize(s); \ + a = MATRIX_VIEW_1DARRAY(Kokkos::view_alloc("matrix_view_1d", Kokkos::SequentialHostInit),s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -272,8 +265,7 @@ enum BASKER_INCOMPLETE_CODE 
{ \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_view_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = MATRIX_VIEW_2DARRAY("matrix_view_2d",s);*/ \ - a.resize(s); \ + a = MATRIX_VIEW_2DARRAY(Kokkos::view_alloc("matrix_view_2d", Kokkos::SequentialHostInit),s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -282,26 +274,12 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC thread_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = THREAD_1DARRAY("thread_1d",s);*/ \ - a.resize(s); \ + a = THREAD_1DARRAY(Kokkos::view_alloc("thread_1d", Kokkos::SequentialHostInit),s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ } -//RESIZE (with copy) -#define RESIZE_1DARRAY(a,os,s) \ - { \ - BASKER_ASSERT(s >= 0, "BASKER ASSERT RESIZE 1D ARRAY: size to alloc >= 0 fails"); \ - Kokkos::resize(a,s); \ - } -#define RESIZE_2DARRAY(a,os1,os2,s1,s2) \ - { \ - BASKER_ASSERT(s1 >= 0 && s2 >= 0, "BASKER ASSERT RESIZE 2D ARRAY: size to alloc >= 0 fails"); \ - Kokkos::resize(a,s1,s2); \ - } -#define RESIZE_INT_1DARRAY(a,os,s) RESIZE_1DARRAY(a,os,s) -#define RESIZE_ENTRY_1DARRAY(a,os,s) RESIZE_1DARRAY(a,os,s) //REALLOC (no copy) #define REALLOC_1DARRAY(a,os,s) \ { \ @@ -315,6 +293,7 @@ enum BASKER_INCOMPLETE_CODE } #define REALLOC_INT_1DARRAY(a,os,s) REALLOC_1DARRAY(a,os,s) #define REALLOC_ENTRY_1DARRAY(a,os,s) REALLOC_1DARRAY(a,os,s) + //Set values #define SET_INT_1DARRAY(a, b, s) \ { \ @@ -341,80 +320,67 @@ enum BASKER_INCOMPLETE_CODE #define FREE_INT_1DARRAY_PAIRS(a) \ { \ - /*a = INT_1DARRAY_PAIRS();*/ \ Kokkos::resize(a,0); \ } #define FREE_INT_1DARRAY(a) \ { \ - /*a = INT_1DARRAY();*/ \ Kokkos::resize(a,0); \ } #define FREE_INT_RANK2DARRAY(a) \ { \ - /*a = INT_RANK2DARRAY();*/ \ Kokkos::resize(a,0); \ } #define FREE_INT_2DARRAY(a,n) \ { \ - /*a = INT_2DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #define FREE_ENTRY_1DARRAY(a) \ { \ - /*a = ENTRY_1DARRAY();*/ \ Kokkos::resize(a,0); \ } #define 
FREE_ENTRY_2DARRAY(a,n) \ { \ - /*a = ENTRY_2DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #define FREE_BOOL_1DARRAY(a) \ { \ - /*a = BOOL_1DARRAY();*/ \ Kokkos::resize(a,0); \ } #define FREE_BOOL_2DARRAY(a,n) \ { \ - /*a = BOOL_2DARRAY();*/ \ Kokkos::resize(a,0); \ } #define FREE_MATRIX_1DARRAY(a) \ { \ - /*a = MATRIX_1DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #define FREE_MATRIX_2DARRAY(a,n) \ { \ - /*a = MATRIX_2DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #define FREE_MATRIX_VIEW_1DARRAY(a) \ { \ - /*a = MATRIX_VIEW_1DARRAY();*/ \ Kokkos::resize(a,0); \ } #define FREE_MATRIX_VIEW_2DARRAY(a,n) \ { \ - /*a = MATRIX_VIEW_2DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #define FREE_THREAD_1DARRAY(a) \ { \ - /*a = THREAD_1DARRAY();*/ \ - a.resize(0); \ + Kokkos::resize(a,0); \ } #else // not BASKER_KOKKOS @@ -456,11 +422,6 @@ enum BASKER_INCOMPLETE_CODE #define MALLOC_MATRIX_VIEW_1DARRAY(a,s) a = new BASKER_MATRIX_VIEW [s] #define MALLOC_MATRIX_VIEW_2DARRAY(a,s) a = new MATRIX_VIEW_1DARRAY[s] #define MALLOC_THREAD_1DARRAY(a,s) a = new BASKER_THREAD [s] -//Resize (copy old data) (come back and add) -#define RESIZE_1DARRAY(a,os,s) BASKER_NO_OP -#define RESIZE_2DARRAY(a,os1,os2,s1,s2) BASKER_NO_OP -#define RESIZE_INT_1DARRAY(a,os,s) BASKER_NO_OP -#define RESIZE_ENTRY_1DARRAY(a,os,s) BASKER_NO_OP //Realloc (dont copy old data) #define REALLOC_1DARRAY(a,os,s) BASKER_NO_OP #define REALLOC_2DARRAY(a,os1,os2,s1,s2) BASKER_NO_OP diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 3691d73c63ba..715ac1c13f5f 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -358,7 +358,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); for(Int row = 0; row < LL_size(b); row++) { @@ -378,7 +378,7 @@ namespace BaskerNS { 
if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); #ifdef BASKER_DEBUG_INIT printf("U Factor init: %d %d, nnz: %ld \n", @@ -402,13 +402,13 @@ namespace BaskerNS LU[b][LU_size(b)-1].nnz = LU[b][LU_size(b)-1].mnnz; for(Int l = lvl+1; l < tree.nlvls+1; l++) { - Int U_col = S[l][kid]; + Int U_col = S(l)(kid); Int my_row_leader = find_leader(kid, l-1); Int my_new_row = - b - S[0][my_row_leader]; + b - S(0)(my_row_leader); - Int U_row = (l==1)?(kid%2):S[lvl][kid]%LU_size(U_col); + Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); //JDB TEST PASS U_row = my_new_row; @@ -460,7 +460,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); for(Int row = 0; row < LL_size(b); row++) { @@ -524,7 +524,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); #ifdef BASKER_DEBUG_INIT printf("U Factor init: %d %d, nnz: %ld \n", @@ -550,13 +550,13 @@ namespace BaskerNS for(Int l = lvl+1; l < tree.nlvls+1; l++) { - Int U_col = S[l][kid]; + Int U_col = S(l)(kid); Int my_row_leader = find_leader(kid, l-1); Int my_new_row = - b - S[0][my_row_leader]; + b - S(0)(my_row_leader); - Int U_row = (l==1)?(kid%2):S[lvl][kid]%LU_size(U_col); + Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); if( (b > 14) && // NDE: Why is 14 specifically used here? 
(b > LU_size(U_col)) && @@ -640,7 +640,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); for(Int row = 0; row < LL_size(b); row++) { @@ -692,7 +692,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); #ifdef BASKER_DEBUG_INTI printf("AUM Factor init: %d %d, kid: %d nnz: %d nrow: %d ncol: %d \n", @@ -731,10 +731,10 @@ namespace BaskerNS //TEST Int my_leader = find_leader(kid,l-1); - Int my_leader_row = S[0][my_leader]; + Int my_leader_row = S(0)(my_leader); //Int my_col_size = pow(2,l); Not used Int my_new_row = - (S[lvl][kid] - my_leader_row); + (S(lvl)(kid) - my_leader_row); //my_new_row = my_new_row%my_col_size; /* @@ -745,7 +745,7 @@ namespace BaskerNS my_col_size, my_new_row); */ - Int U_col = S[l][kid]; + Int U_col = S(l)(kid); Int U_row = my_new_row; //Int U_row = (l==1)?(kid%2):S(lvl)(kid)%LU_size(U_col); @@ -823,7 +823,7 @@ namespace BaskerNS { if(kid%((Int)pow(2,lvl)) == 0) { - Int b = S[lvl][kid]; + Int b = S(lvl)(kid); for(Int l = 0; l < LL_size(b); l++) { @@ -885,7 +885,7 @@ namespace BaskerNS //printf("C: size: %d kid: %d \n", // iws_size, kid); - //thread_array[kid].C.init_matrix("cwork", + //thread_array(kid).C.init_matrix("cwork", // 0, iws_size, // 0, 2, // iws_size*2); @@ -895,7 +895,7 @@ namespace BaskerNS } //Also workspace matrix //This could be made smaller - thread_array[kid].C.init_matrix("cwork", 0, max_sep_size, + thread_array(kid).C.init_matrix("cwork", 0, max_sep_size, 0, 2, max_sep_size*2); } //end if btf_tabs_offset != 0 @@ -905,19 +905,19 @@ namespace BaskerNS { // if any left over for BLK factorization if(Options.btf == BASKER_TRUE) { - Int iws_mult = thread_array[kid].iws_mult; - Int iws_size = thread_array[kid].iws_size; - Int ews_mult = thread_array[kid].ews_mult; - Int ews_size = thread_array[kid].ews_size; + Int iws_mult = thread_array(kid).iws_mult; + Int iws_size = thread_array(kid).iws_size; + Int ews_mult = 
thread_array(kid).ews_mult; + Int ews_size = thread_array(kid).ews_size; for(Int i=0; i < iws_mult*iws_size; i++) { - thread_array[kid].iws[i] = 0; + thread_array(kid).iws[i] = 0; } for(Int i = 0; i < ews_mult*ews_size; i++) { - thread_array[kid].ews[i] = 0.0; + thread_array(kid).ews[i] = 0.0; } } } @@ -928,23 +928,23 @@ namespace BaskerNS { if(btf_tabs_offset != 0) { - INT_1DARRAY &ws = thread_array[kid].iws; - ENTRY_1DARRAY &X = thread_array[kid].ews; - Int iws_size = thread_array[kid].iws_size; - Int iws_mult = thread_array[kid].iws_mult; - Int ews_size = thread_array[kid].ews_size; - Int ews_mult = thread_array[kid].ews_mult; + INT_1DARRAY &ws = thread_array(kid).iws; + ENTRY_1DARRAY &X = thread_array(kid).ews; + Int iws_size = thread_array(kid).iws_size; + Int iws_mult = thread_array(kid).iws_mult; + Int ews_size = thread_array(kid).ews_size; + Int ews_mult = thread_array(kid).ews_mult; } } printf("init_workspace 1d, kid: %d size: %d %d %d %d \n", kid, iws_mult, iws_size, ews_mult, ews_size); for(Int i=0; i< iws_mult*iws_size; i++) { - thread_array[kid].iws[i] = 0; + thread_array(kid).iws[i] = 0; } for(Int i = 0; i < ews_mult*ews_size; i++) { - thread_array[kid].ews[i] = 0; + thread_array(kid).ews[i] = 0; } #endif //endif def basker_2dl //return 0; @@ -2467,12 +2467,12 @@ namespace BaskerNS Int Basker::find_leader(Int kid, Int l) { l = l+1; - Int my_token = S[l][kid]; + Int my_token = S(l)(kid); Int my_loc = kid; while((my_loc > 0)) { my_loc--; - if(S[l][my_loc] != my_token) + if(S(l)(my_loc) != my_token) { my_loc++; break; From da3a195e7b4c9753da8628a830fcd57617b975f9 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 25 Oct 2024 20:27:04 -0600 Subject: [PATCH 069/243] ShyLU - Basker : replace brackes back to parenthesis Signed-off-by: iyamazaki --- .../basker/src/shylubasker_error_manager.hpp | 26 +-- .../basker/src/shylubasker_nfactor_blk.hpp | 94 ++++---- .../src/shylubasker_nfactor_blk_inc.hpp | 144 ++++++------ .../basker/src/shylubasker_nfactor_col.hpp 
| 180 +++++++-------- .../basker/src/shylubasker_nfactor_col2.hpp | 40 ++-- .../src/shylubasker_nfactor_col_inc.hpp | 110 ++++----- .../basker/src/shylubasker_nfactor_diag.hpp | 8 +- .../basker/src/shylubasker_sfactor.hpp | 86 +++---- .../basker/src/shylubasker_sfactor_inc.hpp | 28 +-- .../basker/src/shylubasker_solve_rhs.hpp | 16 +- .../basker/src/shylubasker_solve_rhs_tr.hpp | 16 +- .../basker/src/shylubasker_stats.hpp | 10 +- .../basker/src/shylubasker_tree.hpp | 28 +-- .../basker/src/shylubasker_util.hpp | 214 +++++++++--------- 14 files changed, 490 insertions(+), 510 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp index d9695c6e5c78..cd2c9f57bf0a 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp @@ -94,8 +94,8 @@ namespace BaskerNS { Int blkcol = thread_array(ti).error_blk; Int blkUrow = LU_size(blkcol)-1; - if(LL[blkcol][0].nnz >= - LU[blkcol][blkUrow].nnz) + if(LL(blkcol)(0).nnz >= + LU(blkcol)(blkUrow).nnz) { resize_U = thread_array(ti).error_info; } @@ -116,7 +116,7 @@ namespace BaskerNS std::cout << " ++ resize L( tid = " << ti << " ): new size = " << resize_L << std::endl; } BASKER_MATRIX &L = - LL[thread_array(ti).error_blk][thread_array(ti).error_subblk]; + LL(thread_array(ti).error_blk)(thread_array(ti).error_subblk); REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -142,7 +142,7 @@ namespace BaskerNS std::cout << " ++ resize U( tid = " << ti << " ): new size = " << resize_U << std::endl; } BASKER_MATRIX &U = - LU[thread_array(ti).error_blk][0]; + LU(thread_array(ti).error_blk)(0); REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -153,7 +153,7 @@ namespace BaskerNS U.nnz = resize_U; //Still need to clear pend BASKER_MATRIX &L = - LL[thread_array(ti).error_blk][0]; + LL(thread_array(ti).error_blk)(0); L.clear_pend(); } @@ -167,7 
+167,7 @@ namespace BaskerNS sb++) { BASKER_MATRIX &SL = - LL[thread_array(ti).error_blk][sb]; + LL(thread_array(ti).error_blk)(sb); for(Int i = 0; i < SL.iws_size*SL.iws_mult; ++i) { SL.iws(i) = (Int) 0; @@ -307,7 +307,7 @@ namespace BaskerNS { const Int tsb = (-1*thread_array(ti).error_subblk)-1; BASKER_MATRIX &L = - LL[thread_array(ti).error_blk][tsb]; + LL(thread_array(ti).error_blk)(tsb); REALLOC_INT_1DARRAY(L.row_idx, L.nnz, resize_L); @@ -324,7 +324,7 @@ namespace BaskerNS { const Int tsb = thread_array(ti).error_subblk; BASKER_MATRIX &U = - LU[thread_array(ti).error_blk][tsb]; + LU(thread_array(ti).error_blk)(tsb); REALLOC_INT_1DARRAY(U.row_idx, U.nnz, resize_U); @@ -352,7 +352,7 @@ namespace BaskerNS //Clear workspace, whole column for(Int sb = 0; sb < LL_size(blk); sb++) { - BASKER_MATRIX &SL = LL[blk][sb]; + BASKER_MATRIX &SL = LL(blk)(sb); for(Int i = 0; i < SL.iws_size*SL.iws_mult; ++i) { SL.iws(i) = (Int) 0; @@ -372,7 +372,7 @@ namespace BaskerNS Int blk = S(error_sep_lvl)(p); //if(LL(blk)(0).w_fill == BASKER_TRUE) { - BASKER_MATRIX &TM = LL[blk][0]; + BASKER_MATRIX &TM = LL(blk)(0); //printf( " > p=%d: scol_top = %d, scol = %d, ncol = %d\n",p,scol_top,TM.scol,TM.ncol ); for(Int i = scol_top + TM.scol; i < scol_top + (TM.scol+TM.ncol); i++) { @@ -386,7 +386,7 @@ namespace BaskerNS //Note, will have to clear the perm in all sep blk in that level //Clear permuation BASKER_MATRIX &SL = - LL[thread_array(ti).error_blk][0]; + LL(thread_array(ti).error_blk)(0); //printf( " + scol_top = %d, srow = %d, nrowl = %d\n",scol_top,SL.srow,SL.nrow ); for(Int i = scol_top + SL.srow; i < scol_top + (SL.srow+SL.nrow); i++) { @@ -512,7 +512,7 @@ namespace BaskerNS } //Resize L - BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); + BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); L.clear_pend(); REALLOC_INT_1DARRAY(L.row_idx, L.nnz, @@ -533,7 +533,7 @@ namespace BaskerNS } //Resize U - BASKER_MATRIX &U = (c >= btab ? 
UBTF[c-btab] : U_D[c]); + BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); REALLOC_INT_1DARRAY(U.row_idx, U.nnz, thread_array(ti).error_info); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp index 030d526299a1..2e0434796e33 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp @@ -150,14 +150,14 @@ namespace BaskerNS const Mag normA_blk = BTF_A.anorm; Int b = S(0)(kid); //Which blk from schedule - BASKER_MATRIX &L = LL[b][0]; - BASKER_MATRIX &U = LU[b][LU_size(b)-1]; - BASKER_MATRIX &M = ALM[b][0]; //A->blk + BASKER_MATRIX &L = LL(b)(0); + BASKER_MATRIX &U = LU(b)(LU_size(b)-1); + BASKER_MATRIX &M = ALM(b)(0); //A->blk #ifdef BASKER_2DL //printf("Accessing blk: %d kid: %d \n", b, kid); - INT_1DARRAY ws = LL[b][0].iws; - ENTRY_1DARRAY X = LL[b][0].ews; - Int ws_size = LL[b][0].iws_size; + INT_1DARRAY ws = LL(b)(0).iws; + ENTRY_1DARRAY X = LL(b)(0).ews; + Int ws_size = LL(b)(0).iws_size; #else //else if BASKER_2DL INT_1DARRAY ws = thread_array(kid).iws; ENTRY_1DARRAY X = thread_array(kid).ews; @@ -983,8 +983,8 @@ namespace BaskerNS //Setup variables const Int wsb = S(0)(kid); - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1014,7 +1014,7 @@ namespace BaskerNS const Int b = S(lvl)(kid); //const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); const Int U_col = S(lvl)(kid); Int U_row = LU_size(U_col)-1; if(lvl > 0) @@ -1022,7 +1022,7 @@ namespace BaskerNS //U_row = (lvl==1)?(kid%2):S(l)(kid)%LU_size(U_col); } - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //const Int brow = L.srow; @@ -1130,12 +1130,12 @@ namespace BaskerNS //Setup variables const 
Int b = S(lvl)(kid); const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_g = L.srow + scol_top; // global offset - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; //Int *color = &(ws[0]); Int *pattern = &(ws(ws_size)); @@ -1281,10 +1281,10 @@ namespace BaskerNS //Setup variables const Int b = S(lvl)(kid); const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); #ifdef BASKER_2DL - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; #else INT_1DARRAY ws = thread_array(kid).iws; Int ws_size = thread_array(kid).iws_size; @@ -1454,11 +1454,11 @@ namespace BaskerNS { const Int b = S(lvl)(kid); const Int wsb = S(0)(kid); - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); #ifdef BASKER_2DL - INT_1DARRAY ws = LL[wsb][l].iws; - ENTRY_1DARRAY X = LL[wsb][l].ews; - Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + ENTRY_1DARRAY X = LL(wsb)(l).ews; + Int ws_size = LL(wsb)(l).iws_size; #else INT_1DARRAY ws = thread_array(kid).iws; ENTRY_1DARRAY X = thread_array(kid).ews; @@ -1534,10 +1534,10 @@ namespace BaskerNS Int X_col, Int X_row, Int k, Entry pivot) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; //const Int ws_size = LL(X_col)(X_row).iws_size; //const Int p_size = LL(X_col)(X_row).p_size; @@ -1607,8 +1607,8 @@ namespace BaskerNS if (blkcol == 2 && blkrow == 1) printf( " L.colptr(%d) = %d\n",k+1,lnnz ); #endif - //LL[X_col][X_row].p_size = 0; - LL[X_col][X_row].p_size = 0; + 
//LL(X_col)(X_row).p_size = 0; + LL(X_col)(X_row).p_size = 0; return 0; }//end t_dense_offdiag_mov_L() @@ -1623,12 +1623,12 @@ namespace BaskerNS Int X_col, Int X_row, Int k, Entry pivot) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - const Int ws_size = LL[X_col][X_row].iws_size; - const Int p_size = LL[X_col][X_row].p_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + const Int ws_size = LL(X_col)(X_row).iws_size; + const Int p_size = LL(X_col)(X_row).p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK @@ -1714,7 +1714,7 @@ namespace BaskerNS } #endif - LL[X_col][X_row].p_size = 0; + LL(X_col)(X_row).p_size = 0; return 0; }//end t_offdiag_mov_L() @@ -1733,17 +1733,17 @@ namespace BaskerNS BASKER_BOOL A_option) { //Note: need to add support for offdiag permuation - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int nnz = LL[X_col][X_row].p_size; + Int nnz = LL(X_col)(X_row).p_size; //printf( " t_dense_back_solve_offdiag( LL(%d,%d) and ALM(%d,%d)\n", blkcol,blkrow,blkcol,blkrow ); #ifdef BASKER_DEBUG_NFACTOR_BLK - Int ws_size = LL[X_col][X_row].iws_size; + Int ws_size = LL(X_col)(X_row).iws_size; const Int brow = L.srow; const Int bcol = L.scol; printf("\n\n"); @@ -1831,8 +1831,8 @@ namespace BaskerNS }//over all nonzero in left #ifdef BASKER_2DL - //LL[X_col][X_row].p_size = nnz; - LL[X_col][X_row].p_size = nnz; + //LL(X_col)(X_row).p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif //Debug @@ -1878,14 +1878,14 @@ namespace BaskerNS { //Note: need to add support for offdiag permuation - BASKER_MATRIX &L = LL[blkcol][blkrow]; - 
BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; - Int ws_size = LL[X_col][X_row].iws_size; - Int nnz = LL[X_col][X_row].p_size; + Int ws_size = LL(X_col)(X_row).iws_size; + Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //const Int bcol = L.scol; @@ -2056,8 +2056,8 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - //LL[X_col][X_row].p_size = nnz; - LL[X_col][X_row].p_size = nnz; + //LL(X_col)(X_row).p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif //Debug diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp index cf6fd8b3c0d9..c9e696f50786 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk_inc.hpp @@ -670,8 +670,8 @@ namespace BaskerNS BASKER_MATRIX &L = LL(b)(0); const Int brow = L.srow; - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; //Int *color = &(ws[0]); Int *pattern = &(ws(ws_size)); @@ -990,8 +990,8 @@ namespace BaskerNS BASKER_MATRIX &L = LL(b)(0); const Int brow = L.srow; - INT_1DARRAY ws = LL[wsb][l].iws; - const Int ws_size = LL[wsb][l].iws_size; + INT_1DARRAY ws = LL(wsb)(l).iws; + const Int ws_size = LL(wsb)(l).iws_size; Int *color = &(ws(0)); Int *pattern = &(ws(ws_size)); @@ -1555,14 +1555,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - 
ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; - Int nnz = LL[X_col][X_row].p_size; + Int nnz = LL(X_col)(X_row).p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK printf("t_back_solve_diag, kid: %d blkcol: %d blkrow: %d \n", @@ -1696,7 +1696,7 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL[X_col][X_row].p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif return; @@ -1717,14 +1717,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; - Int nnz = LL[X_col][X_row].p_size; + Int nnz = LL(X_col)(X_row).p_size; //Int brow = L.srow; //Int bcol = L.scol; @@ -1846,7 +1846,7 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL[X_col][X_row].p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif return 0; @@ -1869,14 +1869,14 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; - Int nnz = LL[X_col][X_row].p_size; + Int nnz = LL(X_col)(X_row).p_size; Int brow = L.srow; Int bcol = L.scol; @@ -2046,7 +2046,7 @@ 
namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL[X_col][X_row].p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif return 0; @@ -2065,12 +2065,12 @@ namespace BaskerNS Int k, Entry pivot ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - const Int ws_size = LL[X_col][X_row].iws_size; - const Int p_size = LL[X_col][X_row].p_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + const Int ws_size = LL(X_col)(X_row).iws_size; + const Int p_size = LL(X_col)(X_row).p_size; #ifdef BASKER_DEBUG_NFACTOR_BLK @@ -2155,14 +2155,14 @@ namespace BaskerNS //Fix later if(Options.same_pattern == BASKER_FALSE) { - for(Int i = 0; i < LL[X_col][X_row].nrow; i++) + for(Int i = 0; i < LL(X_col)(X_row).nrow; i++) { stack[i] = BASKER_MAX_IDX; } } L.col_ptr(k+1) = lnnz; - LL[X_col][X_row].p_size = 0; + LL(X_col)(X_row).p_size = 0; return 0; }//end t_offdiag_mov_L_inc_lvl() @@ -2729,8 +2729,8 @@ namespace BaskerNS BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); /* @@ -2740,14 +2740,14 @@ namespace BaskerNS LP_col, LP_row, kid); */ - BASKER_MATRIX *UPP = &LU[UP_col][0]; + BASKER_MATRIX *UPP = &LU(UP_col)(0); if(UP_row != BASKER_MAX_IDX) { - UPP = &(LU[UP_col][UP_row]); + UPP = &(LU(UP_col)(UP_row)); } BASKER_MATRIX &UP = *(UPP); - BASKER_MATRIX *LPP = &LU[LP_col][0]; + BASKER_MATRIX *LPP = &LU(LP_col)(0); if(LP_row != BASKER_MAX_IDX) { LPP = &(LL(LP_col)(LP_row)); @@ -2968,14 +2968,14 @@ namespace BaskerNS Int x_size, Int x_offset, BASKER_BOOL A_option) { - BASKER_MATRIX &L = LL(blkcol)(blkrow); - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = 
LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; - Int nnz = LL[X_col][X_row].p_size; + Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //const Int bcol = L.scol; @@ -3105,7 +3105,7 @@ namespace BaskerNS */ - Int temp = INC_LVL_TEMP(k_i+LL[blkcol][0].srow) + L.inc_lvl(j) + 1; + Int temp = INC_LVL_TEMP(k_i+LL(blkcol)(0).srow) + L.inc_lvl(j) + 1; /* printf("lower row: %d kid: %d inc: %d %d %d j: %d \n", @@ -3182,7 +3182,7 @@ namespace BaskerNS nnz, kid, X_col, X_row); printf("kid %d Ending nnz: %d \n",kid, nnz); #endif - LL[X_col][X_row].p_size = nnz; + LL(X_col)(X_row).p_size = nnz; #endif //Debug @@ -3218,11 +3218,11 @@ namespace BaskerNS Int k, Entry pivot ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - const Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + const Int ws_size = LL(X_col)(X_row).iws_size; //const Int p_size = LL(X_col)(X_row).p_size; //NDE - warning: unused @@ -3295,7 +3295,7 @@ namespace BaskerNS } L.col_ptr(k+1) = lnnz; - LL[X_col][X_row].p_size = 0; + LL(X_col)(X_row).p_size = 0; return 0; }//end t_dense_offdiag_mov_L_inv_lvl() @@ -3314,12 +3314,12 @@ namespace BaskerNS const BASKER_BOOL A_option ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; - BASKER_MATRIX &B = ALM[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); + BASKER_MATRIX &B = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; //Int nnz = LL(X_col)(X_row).p_size; //Int 
brow = L.srow; @@ -3438,11 +3438,11 @@ namespace BaskerNS Int x_size, Int x_offset ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; //Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //Not used @@ -3575,11 +3575,11 @@ namespace BaskerNS Int x_size, Int x_offset ) { - BASKER_MATRIX &L = LL[blkcol][blkrow]; + BASKER_MATRIX &L = LL(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; - Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; + Int ws_size = LL(X_col)(X_row).iws_size; //Int nnz = LL(X_col)(X_row).p_size; //const Int brow = L.srow; //Not used @@ -3757,16 +3757,16 @@ namespace BaskerNS BASKER_MATRIX *B; if(lower == BASKER_TRUE) { - B = &(ALM[blkcol][blkrow]); + B = &(ALM(blkcol)(blkrow)); } else { - B = &(AVM[blkcol][blkrow]); + B = &(AVM(blkcol)(blkrow)); } BASKER_MATRIX &M = *B; //BASKER_MATRIX &M = ALM(blkcol)(blkrow); - INT_1DARRAY ws = LL[X_col][X_row].iws; - const Int ws_size = LL[X_col][X_row].iws_size; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + const Int ws_size = LL(X_col)(X_row).iws_size; Int *color = &(ws(0)); Int *pattern = &(color[ws_size]); @@ -3856,12 +3856,12 @@ namespace BaskerNS for(Int blk = l+1; blk < endblk; ++blk) { // ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; //NDE - warning: unused - INT_1DARRAY &wsL = LL[leader_idx][blk].iws; + INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; //Int p_sizeL = LL(leader_idx)(blk).p_size; - Int ws_sizeL = LL[leader_idx][blk].iws_size; + Int ws_sizeL = LL(leader_idx)(blk).iws_size; // ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; //NDE - warning: unused - INT_1DARRAY &ws = LL[my_idx][blk].iws; - 
const Int ws_size = LL[my_idx][blk].iws_size; + INT_1DARRAY &ws = LL(my_idx)(blk).iws; + const Int ws_size = LL(my_idx)(blk).iws_size; //Int p_size = LL(my_idx)(blk).p_size; Int *color = &(ws[0]); Int *pattern = &(color[ws_size]); @@ -3874,7 +3874,7 @@ namespace BaskerNS Int *stackL = &(patternL[ws_sizeL]); //over all nnnz found - for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) + for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) { //if(kid==3) // { diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp index 68246464f757..289ee65f7ccd 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col.hpp @@ -137,9 +137,9 @@ namespace BaskerNS Int U_col = S(lvl)(kid); Int U_row = 0; - const Int scol = LU[U_col][U_row].scol; - const Int ecol = LU[U_col][U_row].ecol; - const Int ncol = LU[U_col][U_row].ncol; + const Int scol = LU(U_col)(U_row).scol; + const Int ecol = LU(U_col)(U_row).ecol; + const Int ncol = LU(U_col)(U_row).ncol; //for(Int k = scol; k < ecol; k++) //might have to use k+scol for barrier @@ -480,7 +480,7 @@ namespace BaskerNS #endif //end get needed variables// - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //Ask C++ guru if this is ok BASKER_MATRIX *Bp; @@ -488,7 +488,7 @@ namespace BaskerNS //if(sep_flg == BASKER_FALSE) if(l == 0) { - Bp = &(AVM[U_col][U_row]); + Bp = &(AVM(U_col)(U_row)); //bbcol = Bp->scol; } else @@ -503,9 +503,9 @@ namespace BaskerNS // kid, X_col, X_row); - INT_1DARRAY ws = LL[X_col][X_row].iws; - const Int ws_size = LL[X_col][X_row].iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + const Int ws_size = LL(X_col)(X_row).iws_size; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; const Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_a = U.srow; // offset 
within A @@ -871,16 +871,16 @@ namespace BaskerNS Int X_col = S(0)(my_leader); Int X_row = l; //this will change for us Int col_idx_offset = 0; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); const Int bcol = U.scol; #else BASKER_ASSERT(0==1, "t_upper_col_factor_offdiag, only work with with 2D layout"); #endif #ifdef BASKER_2DL - INT_1DARRAY ws = LL[X_col][X_row].iws; - const Int ws_size = LL[X_col][X_row].iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + const Int ws_size = LL(X_col)(X_row).iws_size; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; #else BASKER_ASSERT(0==1, "t_upper_col_factor_offdiag, only works with 2D layout"); #endif @@ -960,7 +960,7 @@ namespace BaskerNS { Int b = S(l)(kid); - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); INT_1DARRAY ws = thread_array(kid).iws; ENTRY_1DARRAY X = thread_array(team_leader).ews; Int ws_size = thread_array(kid).iws_size; @@ -1080,8 +1080,8 @@ namespace BaskerNS #endif //end get needed variables - BASKER_MATRIX &L = LL[L_col][L_row]; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &L = LL(L_col)(L_row); + BASKER_MATRIX &U = LU(U_col)(U_row); BASKER_MATRIX &B = thread_array(kid).C; @@ -1098,9 +1098,9 @@ namespace BaskerNS //B.print(); - INT_1DARRAY ws = LL[X_col][l+1].iws; - const Int ws_size = LL[X_col][l+1].iws_size; - ENTRY_1DARRAY X = LL[X_col][l+1].ews; + INT_1DARRAY ws = LL(X_col)(l+1).iws; + const Int ws_size = LL(X_col)(l+1).iws_size; + ENTRY_1DARRAY X = LL(X_col)(l+1).ews; Int scol_top = btf_tabs[btf_top_tabs_offset]; // the first column index of A const Int brow_a = U.srow; // offset within A @@ -1648,12 +1648,12 @@ namespace BaskerNS Int X_row = l+1; Int col_idx_offset = 0; //can get rid of? 
- BASKER_MATRIX &L = LL[L_col][L_row]; - BASKER_MATRIX &U = LU[U_col][U_row]; //U.fill(); + BASKER_MATRIX &L = LL(L_col)(L_row); + BASKER_MATRIX &U = LU(U_col)(U_row); //U.fill(); - INT_1DARRAY ws = LL[X_col][X_row].iws; - const Int ws_size = LL[X_col][X_row].iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + const Int ws_size = LL(X_col)(X_row).iws_size; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; const Int bcol = U.scol; @@ -1746,7 +1746,7 @@ namespace BaskerNS Int A_col = S(lvl)(kid); Int A_row = (lvl==1)?(2):S(l+1)(kid)%(LU_size(A_col)); - BASKER_MATRIX &B = AVM[A_col][A_col]; + BASKER_MATRIX &B = AVM(A_col)(A_col); const Int my_idx = S(0)(kid); team_leader = find_leader(kid, l); @@ -1769,17 +1769,17 @@ namespace BaskerNS //Split over threads (leader and nonleader) for(Int blk=l+1; blk Accumulate the update from (l-1)th level: // LU(U_col)(U_row) -= L(U_col)(l-1) * U(l-1)(U_row) t_add_extend(thread, kid, lvl, l-1, k, - LU[U_col][U_row].scol, + LU(U_col)(U_row).scol, BASKER_FALSE); if(kid%((Int)pow(2, l)) == 0) @@ -248,7 +248,7 @@ namespace BaskerNS // printf("[3] barrier test, kid: %d leader: %d b_size: %d lvl: %d \n", // kid, my_leader, b_size, lvl); t_basker_barrier(thread, kid, my_leader, - b_size, 3, LU[U_col][U_row].scol, 0); + b_size, 3, LU(U_col)(U_row).scol, 0); for(Int ti = 0; ti < num_threads; ti++) { if (thread_array(kid).error_type != BASKER_SUCCESS) { info = BASKER_ERROR; @@ -287,7 +287,7 @@ namespace BaskerNS printf( " kid=%d: calling t_add_extend(k=%d/%d)\n",kid,k,ncol ); fflush(stdout); #endif t_add_extend(thread, kid,lvl,lvl-1, k, - LU[U_col][U_row].scol, + LU(U_col)(U_row).scol, BASKER_TRUE); } #ifdef BASKER_TIMER @@ -515,7 +515,7 @@ namespace BaskerNS Int U_row = L_col-my_row_leader; Int X_row = l+1; //this will change for us - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); #ifdef BASKER_DEBUG_NFACTOR_COL2 if(L_row >= LL_size(L_col)) { @@ -609,10 +609,10 @@ namespace 
BaskerNS Int endblk = (lower)?(LL_size(my_idx)):(l+2); for(Int blk = l+1; blk < endblk; ++blk) { - ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; - Int p_sizeL = LL[leader_idx][blk].p_size; - ENTRY_1DARRAY &X = LL[my_idx][blk].ews; - INT_1DARRAY &ws = LL[my_idx][blk].iws; + ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; + Int p_sizeL = LL(leader_idx)(blk).p_size; + ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; + INT_1DARRAY &ws = LL(my_idx)(blk).iws; Int *color = &(ws[0]); //printf( " + t_dense_blk_col_copy_atomic2(kid=%d: LL(%d)(%d) += LL(%d)(%d)\n",kid,leader_idx, blk,my_idx,blk); @@ -629,7 +629,7 @@ namespace BaskerNS #endif //over all nnnz found - for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) + for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) { color[jj] = 0; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -677,7 +677,7 @@ namespace BaskerNS //This can be removed in the future if(kid != team_leader) { - LL[my_idx][blk].p_size = 0; + LL(my_idx)(blk).p_size = 0; } else { @@ -685,7 +685,7 @@ namespace BaskerNS printf("SETTING PS: %d L:%d %d kid: %d\n", p_sizeL, leader_idx, blk, kid); #endif - LL[leader_idx][blk].p_size = p_sizeL; + LL(leader_idx)(blk).p_size = p_sizeL; //p_size = 0; //not needed }//over all blks } @@ -735,12 +735,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM[A_col][A_row]); + Bp = &(AVM(A_col)(A_row)); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM[A_col][0]); + Bp = &(ALM(A_col)(0)); } #ifdef BASKER_DEBUG_NFACTOR_COL2 printf("copy, kid: %d bl: %d A: %d %d \n", @@ -749,7 +749,7 @@ namespace BaskerNS // X += B(:, k) BASKER_MATRIX &B = *Bp; - ENTRY_1DARRAY X = LL[leader_idx][bl].ews; + ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; //printf( " -- t_dense_copy_update_matrix2(kid=%d: LL(%d)(%d) += B)\n",kid,leader_idx,bl ); //printf("ADDING UPDATES TO B\n"); //B.info(); @@ -800,9 +800,9 @@ namespace BaskerNS //For recounting patterns in dense blk //Need better sparse 
update - ENTRY_1DARRAY X = LL[leader_idx][bl].ews; - INT_1DARRAY ws = LL[leader_idx][bl].iws; - const Int nrow = LL[leader_idx][bl].nrow; + ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; + INT_1DARRAY ws = LL(leader_idx)(bl).iws; + const Int nrow = LL(leader_idx)(bl).nrow; Int *color = &(ws(0)); #ifdef BASKER_DEBUG_NFACTOR_COL2 printf("moving, kid: %d A: %d %d %d %d p_size: %d \n", @@ -886,7 +886,7 @@ namespace BaskerNS Int col_idx_offset = 0; //can get rid of? - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); pivot = U.tpivot; //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp index 02fde7c7ccad..c6ddadf55092 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col_inc.hpp @@ -101,7 +101,7 @@ namespace BaskerNS //for(Int k = 0; k < 1; ++k) - for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) + for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -148,7 +148,7 @@ namespace BaskerNS //barrier k = 0 usedl1 t_basker_barrier_inc_lvl(thread,kid,my_leader, - b_size, 0, LU[U_col][U_row].scol, 0); + b_size, 0, LU(U_col)(U_row).scol, 0); //printf("1 kid: %d error_leader: %d lvl: %d \n", kid, error_leader, lvl); BASKER_BOOL error_flag = BASKER_FALSE; basker_barrier.ExitGet(error_leader, error_flag); @@ -172,7 +172,7 @@ namespace BaskerNS { //for(Int k = 2; k < 3; ++k) - for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) + for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -181,7 +181,7 @@ namespace BaskerNS #endif t_add_extend_inc_lvl(thread, kid,lvl,l-1, k, - LU[U_col][U_row].scol, + LU(U_col)(U_row).scol, BASKER_FALSE); //where to start again @@ -234,7 +234,7 @@ namespace BaskerNS // printf("[3] barrier test, kid: %d leader: %d b_size: %d 
lvl: %d \n", // kid, my_leader, b_size, lvl); t_basker_barrier_inc_lvl(thread, kid, my_leader, - b_size, 7, LU[U_col][U_row].scol, 0); + b_size, 7, LU(U_col)(U_row).scol, 0); #ifdef BASKER_DEBUG_NFACTOR_COL_INC if(kid == 0) @@ -248,7 +248,7 @@ namespace BaskerNS //if(lvl < 2) { //for(Int k=0; k < 1; ++k) - for(Int k = 0; k < LU[U_col][U_row].ncol; ++k) + for(Int k = 0; k < LU(U_col)(U_row).ncol; ++k) { #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -259,7 +259,7 @@ namespace BaskerNS //printf("test: %d \n", LU(U_col)(U_row).scol); t_add_extend_inc_lvl(thread, kid,lvl,lvl-1, k, - LU[U_col][U_row].scol, + LU(U_col)(U_row).scol, BASKER_TRUE); Entry pivot = 0; if((kid%(Int)(pow(2,lvl))) == 0) @@ -654,13 +654,13 @@ namespace BaskerNS //end get needed variables// //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //Ask C++ guru if this is ok BASKER_MATRIX *Bp; if(l == 0) { - Bp = &(AVM[U_col][U_row]); + Bp = &(AVM(U_col)(U_row)); } else { @@ -674,9 +674,9 @@ namespace BaskerNS // } //B.print(); - INT_1DARRAY ws = LL[X_col][X_row].iws; - const Int ws_size = LL[X_col][X_row].iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + INT_1DARRAY ws = LL(X_col)(X_row).iws; + const Int ws_size = LL(X_col)(X_row).iws_size; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; const Int brow = U.srow; //const Int bcol = U.scol; @@ -1121,7 +1121,7 @@ namespace BaskerNS //Int col_idx_offset = 0; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //const Int bcol = U.scol; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ -1272,7 +1272,7 @@ namespace BaskerNS Int col_idx_offset = 0; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //Need to give them the output pattern @@ -1453,7 +1453,7 @@ namespace BaskerNS Int col_idx_offset = 0; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); //const Int bcol = U.scol; #ifdef BASKER_DEBUG_NFACTOR_COL2 @@ 
-1564,12 +1564,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM[A_col][A_row]); + Bp = &(AVM(A_col)(A_row)); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM[A_col][0]); + Bp = &(ALM(A_col)(0)); } BASKER_MATRIX &B = *Bp; //printf("ADDING UPDATES TO B\n"); @@ -1580,10 +1580,10 @@ namespace BaskerNS //return; //Int team_leader = find_leader(kid, l); //Not used - ENTRY_1DARRAY X = LL[leader_idx][bl].ews; - INT_1DARRAY ws = LL[leader_idx][bl].iws; + ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; + INT_1DARRAY ws = LL(leader_idx)(bl).iws; Int *color = &(ws(0)); - LL[leader_idx][bl].p_size = 0; + LL(leader_idx)(bl).p_size = 0; //Get the columns pattern Int U_pattern_col = A_col; @@ -1606,7 +1606,7 @@ namespace BaskerNS //Copy into C - BASKER_MATRIX &Up = LU[U_pattern_col][U_pattern_row]; + BASKER_MATRIX &Up = LU(U_pattern_col)(U_pattern_row); for(Int i = Up.col_ptr(k); i < Up.col_ptr(k+1); i++) { const Int j = Up.row_idx(i); @@ -1620,7 +1620,7 @@ namespace BaskerNS //if there is a L if(L_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &Lp = LL[L_pattern_col][L_pattern_row]; + BASKER_MATRIX &Lp = LL(L_pattern_col)(L_pattern_row); for(Int i = Lp.col_ptr(k)+1; i < Lp.col_ptr(k+1);i++) { const Int j = Lp.row_idx(i); @@ -1708,12 +1708,12 @@ namespace BaskerNS //printf("upper picked, kid: %d \n", kid); //printf("up: %d %d kid: %d \n", // A_col, A_row, kid); - Bp = &(AVM[A_col][A_row]); + Bp = &(AVM(A_col)(A_row)); } else { //printf("lower picked, kid: %d\n", kid); - Bp = &(ALM[A_col][0]); + Bp = &(ALM(A_col)(0)); } BASKER_MATRIX &B = *Bp; //printf("ADDING UPDATES TO B\n"); @@ -1724,8 +1724,8 @@ namespace BaskerNS //return; //Int team_leader = find_leader(kid, l); //Not used - ENTRY_1DARRAY X = LL[leader_idx][bl].ews; - INT_1DARRAY ws = LL[leader_idx][bl].iws; + ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; + INT_1DARRAY ws = LL(leader_idx)(bl).iws; //const Int brow = 
LL(leader_idx)(bl).srow; //const Int nrow = LL(leader_idx)(bl).nrow; //Int p_size = LL(leader_idx)(bl).p_size; @@ -1789,11 +1789,11 @@ namespace BaskerNS //Int CM_idx = kid; - ENTRY_1DARRAY X = LL[leader_idx][bl].ews; - INT_1DARRAY ws = LL[leader_idx][bl].iws; - const Int ws_size = LL[leader_idx][bl].ews_size; + ENTRY_1DARRAY X = LL(leader_idx)(bl).ews; + INT_1DARRAY ws = LL(leader_idx)(bl).iws; + const Int ws_size = LL(leader_idx)(bl).ews_size; // const Int brow = LL(leader_idx)(bl).srow; //NU //NDE - warning: unused - const Int nrow = LL[leader_idx][bl].nrow; + const Int nrow = LL(leader_idx)(bl).nrow; //Int p_size = LL(leader_idx)(bl).p_size; //For recounting patterns in dense blk @@ -1902,8 +1902,8 @@ namespace BaskerNS #endif //end get needed variables - BASKER_MATRIX &L = LL[L_col][L_row]; - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &L = LL(L_col)(L_row); + BASKER_MATRIX &U = LU(U_col)(U_row); BASKER_MATRIX &B = thread_array(kid).C; @@ -1926,9 +1926,9 @@ namespace BaskerNS } */ - INT_1DARRAY ws = LL[X_col][l+1].iws; - const Int ws_size = LL[X_col][l+1].iws_size; - ENTRY_1DARRAY X = LL[X_col][l+1].ews; + INT_1DARRAY ws = LL(X_col)(l+1).iws; + const Int ws_size = LL(X_col)(l+1).iws_size; + ENTRY_1DARRAY X = LL(X_col)(l+1).ews; const Int brow = U.srow; //const Int bcol = U.scol; @@ -2471,11 +2471,11 @@ namespace BaskerNS Int col_idx_offset = 0; //can get rid of? 
//BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused L - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); - INT_1DARRAY ws = LL[X_col][X_row].iws; + INT_1DARRAY ws = LL(X_col)(X_row).iws; //const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; //const Int brow = U.srow; //const Int bcol = U.scol; @@ -2592,11 +2592,11 @@ namespace BaskerNS //Int col_idx_offset = 0; //can get rid of?//NDE - warning: unused //BASKER_MATRIX &L = LL(L_col)(L_row); //NDE - warning: unused - BASKER_MATRIX &U = LU[U_col][U_row]; + BASKER_MATRIX &U = LU(U_col)(U_row); - INT_1DARRAY ws = LL[X_col][X_row].iws; + INT_1DARRAY ws = LL(X_col)(X_row).iws; //const Int ws_size = LL(X_col)(X_row).iws_size; - ENTRY_1DARRAY X = LL[X_col][X_row].ews; + ENTRY_1DARRAY X = LL(X_col)(X_row).ews; if(kid == leader_id) { @@ -2636,15 +2636,15 @@ namespace BaskerNS { //const Int blk = l+1; - ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; + ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; // INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; //NDE - warning: unused // Int p_sizeL = LL(leader_idx)(blk).p_size; //NDE - warning: unused // Int ws_sizeL = LL(leader_idx)(blk).iws_size; //NDE - warning: unused - ENTRY_1DARRAY &X = LL[my_idx][blk].ews; - INT_1DARRAY &ws = LL[my_idx][blk].iws; + ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; + INT_1DARRAY &ws = LL(my_idx)(blk).iws; // const Int ws_size = LL(my_idx)(blk).iws_size; //NDE - warning: unused //Int p_size = LL(my_idx)(blk).p_size; - LL[my_idx][blk].p_size = 0; + LL(my_idx)(blk).p_size = 0; Int *color = &(ws[0]); // Int *pattern = &(color[ws_size]); //NDE - warning: unused // Int *stack = &(pattern[ws_size]); //NDE - warning: unused @@ -2716,7 +2716,7 @@ namespace BaskerNS if(U_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &UP = LU[U_pattern_col][U_pattern_row]; + BASKER_MATRIX &UP = LU(U_pattern_col)(U_pattern_row); for(Int jj = UP.col_ptr(k); jj < UP.col_ptr(k+1); 
@@ -2730,7 +2730,7 @@ namespace BaskerNS }//if UPattern if(L_pattern_row != BASKER_MAX_IDX) { - BASKER_MATRIX &LP = LL[L_pattern_col][L_pattern_row]; + BASKER_MATRIX &LP = LL(L_pattern_col)(L_pattern_row); for(Int jj = LP.col_ptr(k); jj < LP.col_ptr(k+1); jj++) @@ -2807,13 +2807,13 @@ namespace BaskerNS { //const Int blk = l+1; - ENTRY_1DARRAY &XL = LL[leader_idx][blk].ews; + ENTRY_1DARRAY &XL = LL(leader_idx)(blk).ews; // INT_1DARRAY &wsL = LL(leader_idx)(blk).iws; //NDE - warning: unused - Int p_sizeL = LL[leader_idx][blk].p_size; + Int p_sizeL = LL(leader_idx)(blk).p_size; // Int ws_sizeL = LL(leader_idx)(blk).iws_size; //NDE - warning: unused - ENTRY_1DARRAY &X = LL[my_idx][blk].ews; - INT_1DARRAY &ws = LL[my_idx][blk].iws; - const Int ws_size = LL[my_idx][blk].iws_size; + ENTRY_1DARRAY &X = LL(my_idx)(blk).ews; + INT_1DARRAY &ws = LL(my_idx)(blk).iws; + const Int ws_size = LL(my_idx)(blk).iws_size; //Int p_size = LL(my_idx)(blk).p_size; Int *color = &(ws[0]); Int *pattern = &(color[ws_size]); @@ -2845,7 +2845,7 @@ namespace BaskerNS #endif //over all nnnz found - for(Int jj = 0; jj < LL[my_idx][blk].nrow; ++jj) + for(Int jj = 0; jj < LL(my_idx)(blk).nrow; ++jj) { color[jj] = 0; @@ -2910,7 +2910,7 @@ namespace BaskerNS //This can be removed in the future if(kid != team_leader) { - LL[my_idx][blk].p_size = 0; + LL(my_idx)(blk).p_size = 0; } else { @@ -2918,7 +2918,7 @@ namespace BaskerNS printf("SETTING PS: %d L:%d %d kid: %d\n", p_sizeL, leader_idx, blk, kid); #endif - LL[leader_idx][blk].p_size = p_sizeL; + LL(leader_idx)(blk).p_size = p_sizeL; //p_size = 0; NOT USED }//over all blks } diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp index b87a0f48eadf..ccbd5a33b827 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_diag.hpp @@ -258,8 +258,8 @@ namespace BaskerNS Int btab = 
btf_tabs_offset; BASKER_MATRIX &M = (c >= btab ? BTF_C : BTF_D); - BASKER_MATRIX &U = (c >= btab ? UBTF[c-btab] : U_D[c]); - BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); + BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); + BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); Int k = btf_tabs(c); Int bcol = M.scol; @@ -336,8 +336,8 @@ namespace BaskerNS Int btab = btf_tabs_offset; BASKER_MATRIX &M = (c >= btab ? BTF_C : BTF_D); - BASKER_MATRIX &U = (c >= btab ? UBTF[c-btab] : U_D[c]); - BASKER_MATRIX &L = (c >= btab ? LBTF[c-btab] : L_D[c]); + BASKER_MATRIX &U = (c >= btab ? UBTF(c-btab) : U_D(c)); + BASKER_MATRIX &L = (c >= btab ? LBTF(c-btab) : L_D(c)); Int bcol = M.scol; //JDB: brow hack: fix. diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp index ef9bdb8084ef..c955ff952551 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp @@ -349,35 +349,35 @@ int Basker::sfactor() #endif #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[p]; - e_tree (ALM[blk][0], stree_p, 1); + e_tree (ALM(blk)(0), stree_p, 1); #else - e_tree (ALM[blk][0], stree, 1); + e_tree (ALM(blk)(0), stree, 1); #endif #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1_2 += timer1.seconds(); timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - post_order(ALM[blk][0], stree_p); + post_order(ALM(blk)(0), stree_p); #else - post_order(ALM[blk][0], stree); + post_order(ALM(blk)(0), stree); #endif #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1_3 += timer1.seconds(); timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - col_count (ALM[blk][0], stree_p); + col_count (ALM(blk)(0), stree_p); #else - col_count (ALM[blk][0], stree); + col_count (ALM(blk)(0), stree); #endif #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time1 += timer1.seconds(); #endif 
//Assign nnz here - //leaf_assign_nnz(LL[blk][0], stree, 0); - //leaf_assign_nnz(LU[blk][LU_size[blk]-1], stree, 0); + //leaf_assign_nnz(LL(blk)(0), stree, 0); + //leaf_assign_nnz(LU(blk)(LU_size[blk]-1), stree, 0); if(Options.verbose == BASKER_TRUE) { printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,0); @@ -387,11 +387,11 @@ int Basker::sfactor() timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - leaf_assign_nnz(LL[blk][0], stree_p, 0); - leaf_assign_nnz(LU[blk][LU_size(blk)-1], stree_p, 0); + leaf_assign_nnz(LL(blk)(0), stree_p, 0); + leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree_p, 0); #else - leaf_assign_nnz(LL[blk][0], stree, 0); - leaf_assign_nnz(LU[blk][LU_size(blk)-1], stree, 0); + leaf_assign_nnz(LL(blk)(0), stree, 0); + leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree, 0); #endif #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time2 += timer1.seconds(); @@ -441,10 +441,10 @@ int Basker::sfactor() timer1.reset(); #endif #ifdef SHYLU_BASKER_STREE_LIST - U_blk_sfactor(AVM[U_col][U_row], stree_p, + U_blk_sfactor(AVM(U_col)(U_row), stree_p, gScol(l), gSrow(glvl), off_diag); #else - U_blk_sfactor(AVM[U_col][U_row], stree, + U_blk_sfactor(AVM(U_col)(U_row), stree, gScol(l), gSrow(glvl), off_diag); #endif #ifdef BASKER_TIMER @@ -460,8 +460,8 @@ int Basker::sfactor() // stree, gScol, gSrow); //Assign nnz counts for leaf off-diag - //U_assign_nnz(LU[U_col][U_row], stree, 0); - //L_assign_nnz(LL[blk][l+1], stree, 0); + //U_assign_nnz(LU(U_col)(U_row), stree, 0); + //L_assign_nnz(LL(blk)(l+1), stree, 0); #ifdef BASKER_TIMER timer1.reset(); #endif @@ -472,11 +472,11 @@ int Basker::sfactor() printf( " ++ L_assign_nnz(LL(%d, %d)) fill-factor x(%f+%f = %f)\n",(int)blk,(int)l+1, BASKER_DOM_NNZ_OVER,Options.user_fill,fill_factor); } #ifdef SHYLU_BASKER_STREE_LIST - U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); - L_assign_nnz(LL[blk][l+1], stree_p, fill_factor, 0); + U_assign_nnz(LU(U_col)(U_row), stree_p, fill_factor, 0); + 
L_assign_nnz(LL(blk)(l+1), stree_p, fill_factor, 0); #else - U_assign_nnz(LU[U_col][U_row], stree, fill_factor, 0); - L_assign_nnz(LL[blk][l+1], stree, fill_factor, 0); + U_assign_nnz(LU(U_col)(U_row), stree, fill_factor, 0); + L_assign_nnz(LL(blk)(l+1), stree, fill_factor, 0); #endif #ifdef BASKER_TIMER time2 += timer1.seconds(); @@ -540,43 +540,43 @@ int Basker::sfactor() //gScol(lvl), gSrow(pp)); #ifdef BASKER_TIMER - printf( " >>> S_blk_sfactor( ALM(%d)(%d) with %dx%d and nnz=%d) <<<\n",U_col,U_row, ALM[U_col][U_row].nrow,ALM[U_col][U_row].ncol,ALM[U_col][U_row].nnz ); fflush(stdout); + printf( " >>> S_blk_sfactor( ALM(%d)(%d) with %dx%d and nnz=%d) <<<\n",U_col,U_row, ALM(U_col)(U_row).nrow,ALM(U_col)(U_row).ncol,ALM(U_col)(U_row).nnz ); fflush(stdout); #endif #ifdef SHYLU_BASKER_STREE_LIST auto stree_p = stree_list[pp]; - S_blk_sfactor(ALM[U_col][U_row], stree_p, + S_blk_sfactor(ALM(U_col)(U_row), stree_p, gScol(lvl), gSrow(pp)); #else - S_blk_sfactor(ALM[U_col][U_row], stree, + S_blk_sfactor(ALM(U_col)(U_row), stree, gScol(lvl), gSrow(pp)); #endif #ifdef BASKER_TIMER - printf( " >>> -> nnz = %d\n",ALM[U_col][U_row].nnz ); fflush(stdout); + printf( " >>> -> nnz = %d\n",ALM(U_col)(U_row).nnz ); fflush(stdout); #endif - //S_assign_nnz(LL[U_col][U_row], stree, 0); + //S_assign_nnz(LL(U_col)(U_row), stree, 0); if(Options.verbose == BASKER_TRUE) { printf( " >> S_assign_nnz( LL(%d,%d) )\n",(int)U_col,(int)U_row ); fflush(stdout); } #ifdef SHYLU_BASKER_STREE_LIST - S_assign_nnz(LL[U_col][U_row], stree_p, 0); + S_assign_nnz(LL(U_col)(U_row), stree_p, 0); #else - S_assign_nnz(LL[U_col][U_row], stree, 0); + S_assign_nnz(LL(U_col)(U_row), stree, 0); #endif - //S_assign_nnz(LU[U_col][LU_size[U_col]-1], stree,0); + //S_assign_nnz(LU(U_col)(LU_size[U_col]-1), stree,0); //printf( " >>> S_assign_nnz( LU(%d,%d) )\n",U_col,LU_size(U_col)-1 ); if(Options.verbose == BASKER_TRUE) { printf( " ++ S_assign_nnz(LU(%d, %d))\n",(int)U_col,(int)LU_size(U_col)-1); fflush(stdout); } 
#ifdef SHYLU_BASKER_STREE_LIST - S_assign_nnz(LU[U_col][LU_size(U_col)-1], stree_p, 0); + S_assign_nnz(LU(U_col)(LU_size(U_col)-1), stree_p, 0); #else - S_assign_nnz(LU[U_col][LU_size(U_col)-1], stree, 0); + S_assign_nnz(LU(U_col)(LU_size(U_col)-1), stree, 0); #endif #ifdef BASKER_TIMER - printf( " >>> -> nnz = %d\n",LU[U_col][LU_size(U_col)-1].nnz); fflush(stdout); + printf( " >>> -> nnz = %d\n",LU(U_col)(LU_size(U_col)-1).nnz); fflush(stdout); #endif } #ifdef SHYLU_BASKER_STREE_LIST @@ -614,10 +614,10 @@ int Basker::sfactor() Int off_diag = 1; #ifdef SHYLU_BASKER_STREE_LIST - U_blk_sfactor(AVM[U_col][U_row], stree_p, + U_blk_sfactor(AVM(U_col)(U_row), stree_p, gScol(l), gSrow(pp), off_diag); #else - U_blk_sfactor(AVM[U_col][U_row], stree, + U_blk_sfactor(AVM(U_col)(U_row), stree, gScol(l), gSrow(pp), off_diag); #endif @@ -638,11 +638,11 @@ int Basker::sfactor() fflush(stdout); } #ifdef SHYLU_BASKER_STREE_LIST - U_assign_nnz(LU[U_col][U_row], stree_p, fill_factor, 0); - L_assign_nnz(LL[inner_blk][l-lvl], stree_p, fill_factor, 0); + U_assign_nnz(LU(U_col)(U_row), stree_p, fill_factor, 0); + L_assign_nnz(LL(inner_blk)(l-lvl), stree_p, fill_factor, 0); #else - U_assign_nnz(LU[U_col][U_row], stree, fill_factor, 0); - L_assign_nnz(LL[inner_blk][l-lvl], stree, fill_factor, 0); + U_assign_nnz(LU(U_col)(U_row), stree, fill_factor, 0); + L_assign_nnz(LL(inner_blk)(l-lvl), stree, fill_factor, 0); #endif //printf("Here 1 \n"); } @@ -2491,7 +2491,7 @@ int Basker::sfactor() #ifdef BASKER_TIMER printf( " L_D[%d](%d, size = %d, nnz = %d)\n",i,(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); #endif - L_D[i].init_matrix("LBFT", + L_D(i).init_matrix("LBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2499,12 +2499,12 @@ int Basker::sfactor() nnz); //For pruning - L_D[i].init_pend(); + L_D(i).init_pend(); #ifdef BASKER_TIMER printf( " U_D[%d](%d, size = %d, nnz = %d)\n",i,(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); #endif - U_D[i].init_matrix("UBFT", + 
U_D(i).init_matrix("UBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2546,7 +2546,7 @@ int Basker::sfactor() #ifdef BASKER_TIMER printf( " LBTF(%d, size = %d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); #endif - LBTF[i-btf_tabs_offset].init_matrix("LBFT", + LBTF(i-btf_tabs_offset).init_matrix("LBFT", btf_tabs(i), lblk_size, btf_tabs(i), @@ -2555,12 +2555,12 @@ int Basker::sfactor() //For pruning //printf( " LBTF(%d).init_pend()\n",(int)(i-btf_tabs_offset) ); - LBTF[i-btf_tabs_offset].init_pend(); + LBTF(i-btf_tabs_offset).init_pend(); #ifdef BASKER_TIMER printf( " UBTF(%d, size = %d, nnz = %d)\n",(int)(i-btf_tabs_offset), (int)lblk_size, (int)nnz ); #endif - UBTF[i-btf_tabs_offset].init_matrix("UBFT", + UBTF(i-btf_tabs_offset).init_matrix("UBFT", btf_tabs(i), lblk_size, btf_tabs(i), diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp index 890bc8a17fca..622bdf39a0fd 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor_inc.hpp @@ -100,9 +100,9 @@ namespace BaskerNS for(Int p=0; p < num_threads; ++p) { Int blk = S(0)(p); - sfactor_nd_dom_estimate(ALM[blk][0], - LL[blk][0], - LU[blk][LU_size(blk)-1]); + sfactor_nd_dom_estimate(ALM(blk)(0), + LL(blk)(0), + LU(blk)(LU_size(blk)-1)); for(Int l=0; l < tree.nlvls; l++) { @@ -124,11 +124,11 @@ namespace BaskerNS //JDB TEST PASSED U_row = my_new_row; - sfactor_nd_upper_estimate(AVM[U_col][U_row], - LU[U_col][U_row]); + sfactor_nd_upper_estimate(AVM(U_col)(U_row), + LU(U_col)(U_row)); - sfactor_nd_lower_estimate(ALM[blk][l+1], - LL[blk][l+1]); + sfactor_nd_lower_estimate(ALM(blk)(l+1), + LL(blk)(l+1)); } // end for l @@ -141,9 +141,9 @@ namespace BaskerNS Int U_col = S(lvl+1)(ppp); Int U_row = 0; - sfactor_nd_sep_estimate(ALM[U_col][U_row], - LL[U_col][U_row], - LU[U_col][LU_size(U_col)-1]); + 
sfactor_nd_sep_estimate(ALM(U_col)(U_row), + LL(U_col)(U_row), + LU(U_col)(LU_size(U_col)-1)); Int innerblk = U_col; for(Int l = lvl+1; l < tree.nlvls; l++) @@ -167,12 +167,12 @@ namespace BaskerNS //JDB TEST PASS U_row = my_new_row; - sfactor_nd_sep_upper_estimate(AVM[U_col][U_row], - LU[U_col][U_row]); + sfactor_nd_sep_upper_estimate(AVM(U_col)(U_row), + LU(U_col)(U_row)); sfactor_nd_sep_lower_estimate( - ALM[innerblk][l-lvl], - LL[innerblk][l-lvl]); + ALM(innerblk)(l-lvl), + LL(innerblk)(l-lvl)); }//for - l }//for -p diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp index b2fa1204cd86..b01d3ec72632 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs.hpp @@ -293,7 +293,7 @@ namespace BaskerNS for(Int b = nblks_c-1; b>= 0; b--) { //---Lower solve - BASKER_MATRIX &LC = LBTF[b]; + BASKER_MATRIX &LC = LBTF(b); #ifdef BASKER_DEBUG_SOLVE_RHS printf("\n\n btf b=%ld (%d x %d), LBTF(%d)\n", (long)b, (int)LC.nrow, (int)LC.ncol, (int)b); #endif @@ -303,7 +303,7 @@ namespace BaskerNS //printVec(y,gn); - BASKER_MATRIX &UC = UBTF[b]; + BASKER_MATRIX &UC = UBTF(b); //U(C)\x -> y upper_tri_solve(UC,x,y); @@ -420,7 +420,7 @@ namespace BaskerNS for(Int b = btf_top_tabs_offset-1; b>= 0; b--) { //L(C)\x -> y - BASKER_MATRIX &LC = L_D[b]; + BASKER_MATRIX &LC = L_D(b); lower_tri_solve(LC, x, y); #ifdef BASKER_DEBUG_SOLVE_RHS printf( "\n after L solve (b=%d)\n",b ); fflush(stdout); @@ -429,7 +429,7 @@ namespace BaskerNS #endif //U(C)\y -> x - BASKER_MATRIX &UC = U_D[b]; + BASKER_MATRIX &UC = U_D(b); upper_tri_solve(UC, y, x); #ifdef BASKER_DEBUG_SOLVE_RHS printf( "\n after U solve\n" ); fflush(stdout); @@ -476,7 +476,7 @@ namespace BaskerNS //Forward solve on A for(Int b = 0; b < tree.nblks; ++b) { - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); //L\x -> y lower_tri_solve(L, x, y, scol_top); @@ 
-500,7 +500,7 @@ namespace BaskerNS //Update offdiag for(Int bb = 1; bb < LL_size(b); ++bb) { - BASKER_MATRIX &LD = LL[b][bb]; + BASKER_MATRIX &LD = LL(b)(bb); //x = LD*y; #ifdef BASKER_DEBUG_SOLVE_RHS char filename[200]; @@ -549,7 +549,7 @@ namespace BaskerNS #endif //U\y -> x - BASKER_MATRIX &U = LU[b][LU_size(b)-1]; + BASKER_MATRIX &U = LU(b)(LU_size(b)-1); upper_tri_solve(U, y, x, scol_top); // NDE: y , x positions swapped... // seems role of x and y changed... #ifdef BASKER_DEBUG_SOLVE_RHS @@ -568,7 +568,7 @@ namespace BaskerNS #endif //y = UB*x; - BASKER_MATRIX &UB = LU[b][bb]; + BASKER_MATRIX &UB = LU(b)(bb); neg_spmv(UB, x, y, scol_top); #ifdef BASKER_DEBUG_SOLVE_RHS diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp index bfd6e2460062..f950e9bd6132 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_solve_rhs_tr.hpp @@ -346,10 +346,10 @@ namespace BaskerNS // Update off-diag in the block-row before the diag solve for(int bb = LL_size(b)-1; bb > 0; bb--) { - BASKER_MATRIX &LD = LL[b][bb]; + BASKER_MATRIX &LD = LL(b)(bb); neg_spmv_perm_tr(LD, x, y, scol_top); // update y as mod. rhs, x as solution } - BASKER_MATRIX &L = LL[b][0]; + BASKER_MATRIX &L = LL(b)(0); if (L.nrow != 0 && L.ncol != 0) // Avoid degenerate case e.g. empty block following nd-partitioning lower_tri_solve_tr(L, y, x, scol_top); // x and y should be equal after in M range... 
} @@ -373,10 +373,10 @@ namespace BaskerNS for(Int bb = 0; bb < LU_size(b)-1; bb++) { // update offdiag corresponding to the block-row - BASKER_MATRIX &UB = LU[b][bb]; + BASKER_MATRIX &UB = LU(b)(bb); neg_spmv_tr(UB, x, y, scol_top); } - BASKER_MATRIX &U = LU[b][LU_size(b)-1]; + BASKER_MATRIX &U = LU(b)(LU_size(b)-1); if (U.nrow != 0 && U.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(U, x, y, scol_top); } @@ -410,7 +410,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of { for(Int b = 0; b < btf_top_tabs_offset; b++) { - BASKER_MATRIX &UC = U_D[b]; + BASKER_MATRIX &UC = U_D(b); if ( b > 0 ) spmv_BTF_tr(b, BTF_D, x, y, false); @@ -418,7 +418,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (UC.nrow != 0 && UC.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(UC, x, y); - BASKER_MATRIX &LC = L_D[b]; + BASKER_MATRIX &LC = L_D(b); if (LC.nrow != 0 && LC.ncol != 0) // Avoid degenerate case lower_tri_solve_tr(LC, x, y); @@ -462,7 +462,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (nblks_c > 0) { Int offset = 0; for(Int b = 0; b < nblks_c; b++) { - BASKER_MATRIX &UC = UBTF[b]; + BASKER_MATRIX &UC = UBTF(b); // Update off-diag // Update X with Y @@ -472,7 +472,7 @@ if (Options.verbose) std::cout << "BTF_D^T begin: from 0 to " << btf_top_tabs_of if (UC.nrow != 0 && UC.ncol != 0) // Avoid degenerate case upper_tri_solve_tr(UC,x,y); - BASKER_MATRIX &LC = LBTF[b]; + BASKER_MATRIX &LC = LBTF(b); if (LC.nrow != 0 && LC.ncol != 0) // Avoid degenerate case lower_tri_solve_tr(LC,x,y); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_stats.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_stats.hpp index 995bad188542..c7f804794f67 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_stats.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_stats.hpp @@ -148,8 +148,8 @@ namespace BaskerNS for(Int l = 0; l < tree.nblks; 
l++) { - MATRIX &myL = LL[l][0]; - stats.Lnnz += LL[l][0].nnz; + MATRIX &myL = LL(l)(0); + stats.Lnnz += LL(l)(0).nnz; }//over all Ls return stats.Lnnz; @@ -166,10 +166,10 @@ namespace BaskerNS for(Int l = 0; l < tree.nblks; l++) { - for(Int r=0; r 0 ? U_view_count(i) : 1); if (U_view_size > 0) { - MALLOC_MATRIX_1DARRAY(AVM[i], U_view_size); - MALLOC_MATRIX_1DARRAY(LU[i], U_view_size); + MALLOC_MATRIX_1DARRAY(AVM(i), U_view_size); + MALLOC_MATRIX_1DARRAY(LU(i), U_view_size); } //Malloc AL subarray // NOTE: size at least one to allow empty block Int L_view_size = (L_view_count(i) > 0 ? L_view_count(i): 1); if (L_view_size > 0) { - MALLOC_MATRIX_1DARRAY(ALM[i], L_view_size); - MALLOC_MATRIX_1DARRAY(LL[i], L_view_size); + MALLOC_MATRIX_1DARRAY(ALM(i), L_view_size); + MALLOC_MATRIX_1DARRAY(LL(i), L_view_size); } LU_size(i) = U_view_count(i); @@ -855,11 +855,11 @@ namespace BaskerNS #endif for(Int j=i; j != -flat.ncol; j=tree.treetab[j]) { - MATRIX_1DARRAY &UMtemp = AVM[j]; - MATRIX_1DARRAY &LMtemp = ALM[i]; + MATRIX_1DARRAY &UMtemp = AVM(j); + MATRIX_1DARRAY &LMtemp = ALM(i); - MATRIX_1DARRAY &LUtemp = LU[j]; - MATRIX_1DARRAY &LLtemp = LL[i]; + MATRIX_1DARRAY &LUtemp = LU(j); + MATRIX_1DARRAY &LLtemp = LL(i); #ifdef MY_DEBUG printf( " AVM(%d)(%d).set_shape(%dx%d)\n",j,U_view_count[j], tree.col_tabs[i+1]-tree.col_tabs[i],tree.col_tabs[j+1]-tree.col_tabs[j] ); @@ -1056,7 +1056,7 @@ namespace BaskerNS (r_idx < tree.nblks && tree.row_tabs(r_idx+1) == tree.row_tabs(r_idx))) // skip empty blocks { if((L_row+1 < LL_size(L_col)) && - (tree.row_tabs(r_idx+1) == ALM[L_col][L_row+1].srow)) + (tree.row_tabs(r_idx+1) == ALM(L_col)(L_row+1).srow)) { //printf( " > ALM(%d)(%d).srow = %d, row_tab(%d) = %d\n",L_col,L_row+1,ALM(L_col)(L_row+1).srow, r_idx+1,tree.row_tabs(r_idx+1) ); L_row++; @@ -1071,7 +1071,7 @@ namespace BaskerNS (r_idx < tree.nblks && tree.row_tabs(r_idx+1) == tree.row_tabs(r_idx))) // skip empty blocks { if((U_row+1 < LU_size(U_col)) && - (tree.row_tabs(r_idx+1) == 
AVM[U_col][U_row+1].srow)) + (tree.row_tabs(r_idx+1) == AVM(U_col)(U_row+1).srow)) { //printf( " + AVM(%d)(%d).srow = %d, row_tab(%d) = %d\n",U_col,U_row+1,AVM(U_col)(U_row+1).srow, r_idx+1,tree.row_tabs(r_idx+1) ); U_row++; @@ -1095,8 +1095,8 @@ namespace BaskerNS //Get Matrix Ref - BASKER_MATRIX &Ltemp = ALM[L_col][L_row]; - BASKER_MATRIX &Utemp = AVM[U_col][U_row]; + BASKER_MATRIX &Ltemp = ALM(L_col)(L_row); + BASKER_MATRIX &Utemp = AVM(U_col)(U_row); Int bcol = Ltemp.scol; //diag blk @@ -1162,11 +1162,11 @@ namespace BaskerNS for(Int sb = 0; sb < LL_size(b); ++sb) { //printf( " ALM(%d)(%d).clean_col()\n",b,sb ); - ALM[b][sb].clean_col(); + ALM(b)(sb).clean_col(); } for(Int sb = 0; sb < LU_size(b); ++sb) { - AVM[b][sb].clean_col(); + AVM(b)(sb).clean_col(); } }//for - over all blks diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 715ac1c13f5f..2d8322c05de2 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -327,7 +327,7 @@ namespace BaskerNS { for(Int b=chunk_start; b < chunk_end; b++) { - BASKER_MATRIX &L = LBTF[b-btf_tabs_offset]; + BASKER_MATRIX &L = LBTF(b-btf_tabs_offset); L.clear_pend(); L.nnz = L.mnnz; }//end-for over chunck @@ -342,7 +342,7 @@ namespace BaskerNS #if defined(BASKER_SPLIT_A) for(Int b=chunk_start; b < chunk_end; b++) { - BASKER_MATRIX &L = L_D[b]; + BASKER_MATRIX &L = L_D(b); L.clear_pend(); L.nnz = L.mnnz; }//end-for over chunck @@ -364,11 +364,11 @@ namespace BaskerNS { #ifdef BASKER_DEBUG_INIT printf("L Factor Init: %d %d , kid: %d, nnz: %ld \n", - b, row, kid, LL[b][row].nnz); + b, row, kid, LL(b)(row).nnz); #endif - LL[b][row].clear_pend(); - LL[b][row].nnz = LL[b][row].mnnz; + LL(b)(row).clear_pend(); + LL(b)(row).nnz = LL(b)(row).mnnz; }//end over all row }//end select which thread @@ -383,13 +383,13 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("U 
Factor init: %d %d, nnz: %ld \n", b, LU_size[b]-1, - LU[b][LU_size[b]-1].nnz); + LU(b)(LU_size[b]-1).nnz); #endif //LU(b)(LU_size(b)-1).nnz = 0; - for(Int kk = 0; kk < LU[b][LU_size(b)-1].ncol+1; kk++) + for(Int kk = 0; kk < LU(b)(LU_size(b)-1).ncol+1; kk++) { - LU[b][LU_size(b)-1].col_ptr(kk) = 0; + LU(b)(LU_size(b)-1).col_ptr(kk) = 0; } /* @@ -399,7 +399,7 @@ namespace BaskerNS LU(b)(LU_size(b)-1).mnnz); */ - LU[b][LU_size(b)-1].nnz = LU[b][LU_size(b)-1].mnnz; + LU(b)(LU_size(b)-1).nnz = LU(b)(LU_size(b)-1).mnnz; for(Int l = lvl+1; l < tree.nlvls+1; l++) { Int U_col = S(l)(kid); @@ -416,12 +416,12 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("Init U: %d %d lvl: %d l: %d kid: %d nnz: %ld \n", U_col, U_row, lvl, l, kid, - LU[U_col][U_row].nnz); + LU(U_col)(U_row).nnz); #endif - for(Int kk = 0; kk < LU[U_col][U_row].ncol+1; kk++) + for(Int kk = 0; kk < LU(U_col)(U_row).ncol+1; kk++) { - LU[U_col][U_row].col_ptr(kk) = 0; + LU(U_col)(U_row).col_ptr(kk) = 0; } /* printf("flipU (%d,%d) %d %d \n", @@ -430,7 +430,7 @@ namespace BaskerNS LU(U_col)(U_row).mnnz); */ - LU[U_col][U_row].nnz = LU[U_col][U_row].mnnz; + LU(U_col)(U_row).nnz = LU(U_col)(U_row).mnnz; //LU(U_col)(U_row).nnz = 0; }//over inner lvls @@ -466,19 +466,19 @@ namespace BaskerNS { #ifdef BASKER_DEBUG_INIT printf("L Factor Init: %d %d , kid: %d, nnz: %ld \n", - b, row, kid, LL[b][row].nnz); + b, row, kid, LL(b)(row).nnz); #endif #ifdef BASKER_TIMER timer_init_matrixL.reset(); - printf( " ++ lvl=%d: LL(%d,%d): nnz=%d, mnnz=%d ++\n",(int)lvl, (int)b, (int)row, (int)LL[b][row].nnz, (int)LL[b][row].mnnz); fflush(stdout); + printf( " ++ lvl=%d: LL(%d,%d): nnz=%d, mnnz=%d ++\n",(int)lvl, (int)b, (int)row, (int)LL(b)(row).nnz, (int)LL(b)(row).mnnz); fflush(stdout); #endif - LL[b][row].init_matrix("Loffdig", - LL[b][row].srow, - LL[b][row].nrow, - LL[b][row].scol, - LL[b][row].ncol, - LL[b][row].nnz); + LL(b)(row).init_matrix("Loffdig", + LL(b)(row).srow, + LL(b)(row).nrow, + LL(b)(row).scol, + 
LL(b)(row).ncol, + LL(b)(row).nnz); #ifdef BASKER_TIMER printf( " >> LL(%d,%d).init_matrix done <<\n",b,row ); fflush(stdout); init_matrixL_time += timer_init_matrixL.seconds(); @@ -487,20 +487,20 @@ namespace BaskerNS //Fix when this all happens in the future if(Options.incomplete == BASKER_TRUE) { - LL[b][row].init_inc_lvl(); + LL(b)(row).init_inc_lvl(); } #ifdef BASKER_TIMER timer_fill_matrixL.reset(); - printf( " ++ zero out (%d) ++\n",int(LL[b][row].col_ptr.extent(0)) ); fflush(stdout); + printf( " ++ zero out (%d) ++\n",int(LL(b)(row).col_ptr.extent(0)) ); fflush(stdout); #endif //LL(b)(row).fill(); - LL[b][row].init_ptr(); + LL(b)(row).init_ptr(); //Kokkos::deep_copy(LL(b)(row).col_ptr, 0); #ifdef BASKER_TIMER - printf( " LL(%d)(%d).init_pend(ncol = %d)\n",b,row,LL[b][row].ncol ); fflush(stdout); + printf( " LL(%d)(%d).init_pend(ncol = %d)\n",b,row,LL(b)(row).ncol ); fflush(stdout); fill_matrixL_time += timer_fill_matrixL.seconds(); #endif - LL[b][row].init_pend(); + LL(b)(row).init_pend(); #ifdef BASKER_TIMER printf( " (b=%d: row=%d) done\n\n",b,row ); fflush(stdout); #endif @@ -529,23 +529,23 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("U Factor init: %d %d, nnz: %ld \n", b, LU_size[b]-1, - LU[b][LU_size[b]-1].nnz); + LU(b)(LU_size[b]-1).nnz); #endif #ifdef BASKER_TIMER printf( " lvl=%d: LU(%d,%d): %dx%d, nnz=%d, mnnz=%d, at (%d,%d)\n", (int)lvl, (int)b, (int)LU_size(b)-1, - (int)LU[b][LU_size(b)-1].nrow,(int)LU[b][LU_size(b)-1].ncol,(int)LU[b][LU_size(b)-1].nnz, (int)LU[b][LU_size(b)-1].mnnz, - (int)LU[b][LU_size(b)-1].srow,(int)LU[b][LU_size(b)-1].scol); + (int)LU(b)(LU_size(b)-1).nrow,(int)LU(b)(LU_size(b)-1).ncol,(int)LU(b)(LU_size(b)-1).nnz, (int)LU(b)(LU_size(b)-1).mnnz, + (int)LU(b)(LU_size(b)-1).srow,(int)LU(b)(LU_size(b)-1).scol); #endif - LU[b][LU_size(b)-1].init_matrix("Udiag", - LU[b][LU_size(b)-1].srow, - LU[b][LU_size(b)-1].nrow, - LU[b][LU_size(b)-1].scol, - LU[b][LU_size(b)-1].ncol, - LU[b][LU_size(b)-1].nnz); + 
LU(b)(LU_size(b)-1).init_matrix("Udiag", + LU(b)(LU_size(b)-1).srow, + LU(b)(LU_size(b)-1).nrow, + LU(b)(LU_size(b)-1).scol, + LU(b)(LU_size(b)-1).ncol, + LU(b)(LU_size(b)-1).nnz); //LU(b)(LU_size(b)-1).fill(); - LU[b][LU_size(b)-1].init_ptr(); + LU(b)(LU_size(b)-1).init_ptr(); //Kokkos::deep_copy(LU(b)(LU_size(b)-1).col_ptr, 0); for(Int l = lvl+1; l < tree.nlvls+1; l++) @@ -583,29 +583,29 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("Init U: %d %d lvl: %d l: %d kid: %d nnz: %ld \n", U_col, U_row, lvl, l, kid, - LU[U_col][U_row].nnz); + LU(U_col)(U_row).nnz); #endif #ifdef BASKER_TIMER printf( " +++ l=%d: LU(%d,%d): %dx%d, nnz=%d, mnnz=%d at (%d,%d)\n", (int)l, (int)U_col, (int)U_row, - (int)LU[U_col][U_row].nrow,(int)LU[U_col][U_row].ncol, - (int)LU[U_col][U_row].nnz, (int)LU[U_col][U_row].mnnz, - (int)LU[U_col][U_row].srow,(int)LU[U_col][U_row].scol); + (int)LU(U_col)(U_row).nrow,(int)LU(U_col)(U_row).ncol, + (int)LU(U_col)(U_row).nnz, (int)LU(U_col)(U_row).mnnz, + (int)LU(U_col)(U_row).srow,(int)LU(U_col)(U_row).scol); #endif - LU[U_col][U_row].init_matrix("Uoffdiag", - LU[U_col][U_row].srow, - LU[U_col][U_row].nrow, - LU[U_col][U_row].scol, - LU[U_col][U_row].ncol, - LU[U_col][U_row].nnz); + LU(U_col)(U_row).init_matrix("Uoffdiag", + LU(U_col)(U_row).srow, + LU(U_col)(U_row).nrow, + LU(U_col)(U_row).scol, + LU(U_col)(U_row).ncol, + LU(U_col)(U_row).nnz); //LU(U_col)(U_row).fill(); - LU[U_col][U_row].init_ptr(); + LU(U_col)(U_row).init_ptr(); //Kokkos::deep_copy(LU(U_col)(U_row).col_ptr, 0); if(Options.incomplete == BASKER_TRUE) { - LU[U_col][U_row].init_inc_lvl(); + LU(U_col)(U_row).init_inc_lvl(); } }//over inner lvls @@ -646,9 +646,9 @@ namespace BaskerNS { #ifdef BASKER_DEBUG_INIT printf("ALM Factor Init: %d %d , kid: %d, nnz: %d nrow: %d ncol: %d \n", - b, row, kid, ALM[b][row].nnz, - ALM[b][row].nrow, - ALM[b][row].ncol); + b, row, kid, ALM(b)(row).nnz, + ALM(b)(row).nrow, + ALM(b)(row).ncol); #endif /*if (kid == 1) @@ -663,7 +663,7 @@ namespace 
BaskerNS printf("ALM(%d,%d: %dx%d) alloc with A: kid=%d btf=%d\n", b, row, ALM(b)(row).nrow, ALM(b)(row).ncol, kid, Options.btf); #endif - ALM[b][row].convert2D(A, alloc, kid); + ALM(b)(row).convert2D(A, alloc, kid); } else { @@ -672,7 +672,7 @@ namespace BaskerNS printf("ALM(%d,%d, %dx%d) alloc (btf) with BTF_A: kid=%d \n", b, row, ALM(b)(row).nrow, ALM(b)(row).ncol, kid); #endif - ALM[b][row].convert2D(BTF_A, alloc, kid); + ALM(b)(row).convert2D(BTF_A, alloc, kid); } /*if (kid == 0) { for(Int j = 0; j < ALM(b)(row).ncol; j++) { @@ -697,9 +697,9 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INTI printf("AUM Factor init: %d %d, kid: %d nnz: %d nrow: %d ncol: %d \n", b, LU_size(b)-1, kid, - AVM[b][LU_size(b)-1].nnz, - AVM[b][LU_size(b)-1].nrow, - AVM[b][LU_size(b)-1].ncol); + AVM(b)(LU_size(b)-1).nnz, + AVM(b)(LU_size(b)-1).nrow, + AVM(b)(LU_size(b)-1).ncol); #endif /*if (kid == 1) { @@ -708,13 +708,13 @@ namespace BaskerNS }*/ if(Options.btf == BASKER_FALSE) { - AVM[b][LU_size(b)-1].convert2D(A, alloc, kid); + AVM(b)(LU_size(b)-1).convert2D(A, alloc, kid); } else { //printf("Using BTF AU\n"); //printf(" > kid=%d: convert2D AVM(%d,%d)\n", kid, b, LU_size(b)-1); - AVM[b][LU_size(b)-1].convert2D(BTF_A, alloc, kid); + AVM(b)(LU_size(b)-1).convert2D(BTF_A, alloc, kid); } /*if (kid == 0) { for(Int j = 0; j < AVM(b)(LU_size(b)-1).ncol; j++) { @@ -771,9 +771,9 @@ namespace BaskerNS #ifdef BASKER_DEBUG_INIT printf("Init AUM: %d %d lvl: %d l: %d kid: %d nnz: %d nrow: %d ncol: %d \n", U_col, U_row, lvl, l, kid, - AVM[U_col][U_row].nnz, - AVM[U_col][U_row].nrow, - AVM[U_col][U_row].ncol); + AVM(U_col)(U_row).nnz, + AVM(U_col)(U_row).nrow, + AVM(U_col)(U_row).ncol); #endif #if 0 @@ -793,7 +793,7 @@ namespace BaskerNS //printf("2nd convert AVM: %d %d size:%d kid: %d\n", // U_col, U_row, AVM(U_col)(U_row).nnz, // kid); - AVM[U_col][U_row].convert2D(BTF_A, alloc, kid); + AVM(U_col)(U_row).convert2D(BTF_A, alloc, kid); //printf(" %d: Using BTF AU(%d,%d) done\n",kid,U_col,U_row); } @@ 
-828,17 +828,17 @@ namespace BaskerNS for(Int l = 0; l < LL_size(b); l++) { //defining here - LL[b][l].iws_size = LL[b][l].nrow; + LL(b)(l).iws_size = LL(b)(l).nrow; //This can be made smaller, see notes in Sfactor_old - LL[b][l].iws_mult = 5; - LL[b][l].ews_size = LL[b][l].nrow; + LL(b)(l).iws_mult = 5; + LL(b)(l).ews_size = LL(b)(l).nrow; //This can be made smaller, see notes in sfactor_old - LL[b][l].ews_mult = 2; + LL(b)(l).ews_mult = 2; - Int iws_size = LL[b][l].iws_size; - Int iws_mult = LL[b][l].iws_mult; - Int ews_size = LL[b][l].ews_size; - Int ews_mult = LL[b][l].ews_mult; + Int iws_size = LL(b)(l).iws_size; + Int iws_mult = LL(b)(l).iws_mult; + Int ews_size = LL(b)(l).ews_size; + Int ews_mult = LL(b)(l).ews_mult; if(iws_size > max_sep_size) { @@ -851,10 +851,10 @@ namespace BaskerNS } BASKER_ASSERT((iws_size*iws_mult)>0, "util iws"); - MALLOC_INT_1DARRAY(LL[b][l].iws, iws_size*iws_mult); + MALLOC_INT_1DARRAY(LL(b)(l).iws, iws_size*iws_mult); for(Int i=0; i 0) { BASKER_ASSERT((ews_size*ews_mult)>0, "util ews"); - MALLOC_ENTRY_1DARRAY(LL[b][l].ews, ews_size*ews_mult); + MALLOC_ENTRY_1DARRAY(LL(b)(l).ews, ews_size*ews_mult); for(Int i=0; i Date: Fri, 25 Oct 2024 20:42:26 -0600 Subject: [PATCH 070/243] ShyLU - Basker : cleanups Signed-off-by: iyamazaki --- packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp | 1 - packages/shylu/shylu_node/basker/src/shylubasker_types.hpp | 4 ---- 2 files changed, 5 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp index 5b6ae49e5e14..784df704eb59 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_tree.hpp @@ -1178,7 +1178,6 @@ namespace BaskerNS BASKER_INLINE int Basker::sfactor_copy() { - printf( " .. 
sfactor_copy ..\n" ); fflush(stdout); //Reorder A; //Match order if(match_flag == BASKER_TRUE) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp index 193ecb11e24a..f57447b10906 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp @@ -172,7 +172,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC int_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ - /*a = INT_1DARRAY(BASKER_KOKKOS_NOINIT("int_1d"),s);*/ \ Kokkos::resize(a, s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -182,7 +181,6 @@ enum BASKER_INCOMPLETE_CODE { \ BASKER_ASSERT(s0>0, "BASKER ASSERT MALLOC int_rank2d: size to alloc > 0 fails"); \ BASKER_ASSERT(s1>0, "BASKER ASSERT MALLOC int_rank2d: size to alloc > 0 fails"); \ - /*a = INT_RANK2DARRAY(BASKER_KOKKOS_NOINIT("int_rank2d"),s0,s1);*/ \ Kokkos::resize(a, s0,s1); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ @@ -237,7 +235,6 @@ enum BASKER_INCOMPLETE_CODE BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_1d: size to alloc >= 0 fails"); \ if (s > 0) { \ a = MATRIX_1DARRAY(Kokkos::view_alloc("matrix_1d", Kokkos::SequentialHostInit),s); \ - Kokkos::resize(a,s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ @@ -247,7 +244,6 @@ enum BASKER_INCOMPLETE_CODE BASKER_ASSERT(s >= 0, "BASKER ASSERT MALLOC matrix_2d: size to alloc >= 0 fails"); \ if (s > 0) { \ a = MATRIX_2DARRAY(Kokkos::view_alloc("matrix_2d", Kokkos::SequentialHostInit),s); \ - Kokkos::resize(a,s); \ if(a.data() == NULL) \ throw std::bad_alloc(); \ } \ From 18097083fa60054ee56ada1f3afdf71f80c6fcc0 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 15:56:58 -0600 Subject: [PATCH 071/243] Add AT2 runner, usage of GenConfig, get-changed-packages.sh Squashing all the terrible commits I made while using the GitHub web interface. 
The interface does not seem to have support for signing with DCO. Includes changes which modify the event triggers to comply with new AT2 specifications, assignment of an AT2 runner to run on, usage of GenConfig to load the environment, and prototype implementation of calling the get-changed-trilinos-packages.sh script. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 65 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7b51bbec8c75..e0478400bf5a 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,10 +12,11 @@ name: "CodeQL: Linear Solvers" on: - #push: - # branches: [ "muelu-sync-workflow" ] pull_request: branches: [ "develop" ] + types: + - opened + - synchronize schedule: - cron: '41 23 * * 2' @@ -25,17 +26,12 @@ permissions: jobs: analyze: name: Analyze (${{ matrix.language }}) - # Runner size impacts CodeQL analysis time. To learn more, please see: - # - https://gh.io/recommended-hardware-resources-for-running-codeql - # - https://gh.io/supported-runners-and-hardware-resources - # - https://gh.io/using-larger-runners (GitHub.com only) - # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
- runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} - timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] + if: ${{ github.event.action == 'synchronize' || github.event.action == 'opened' }} + permissions: # required for all workflows security-events: write - # only required for workflows in private repositories actions: read contents: read @@ -46,16 +42,7 @@ jobs: include: - language: c-cpp build-mode: manual - #- language: python - # build-mode: none - # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' - # Use `c-cpp` to analyze code written in C, C++ or both - # Use 'java-kotlin' to analyze code written in Java, Kotlin or both - # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both - # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, - # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 
- # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how - # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: - name: Checkout repository uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -70,20 +57,38 @@ jobs: query-filters: - exclude: tags: cpp/integer-multiplication-cast-to-long - + - name: env + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: | + env + - name: module list + shell: bash -l {0} + run: | + module list + printenv PATH - if: matrix.build-mode == 'manual' - name: Configure Trilinos + name: Get dependencies run: | - mkdir -p trilinos_build - cd trilinos_build - cmake -G 'Unix Makefiles' -DTrilinos_ENABLE_TESTS=OFF -DTrilinos_ENABLE_Epetra=OFF -DTrilinos_ENABLE_AztecOO=OFF -DTrilinos_ENABLE_Ifpack=OFF -DTrilinos_ENABLE_ML=OFF -D Trilinos_ENABLE_Triutils=OFF -DTrilinos_ENABLE_Tpetra=ON -DTrilinos_ENABLE_MueLu=ON -DTrilinos_ENABLE_Krino=OFF .. 
- + bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + - if: matrix.build-mode == 'manual' + name: Generate CMake fragment for changed packages + run: | + git fetch origin ${GITHUB_BASE_REF} + git branch + bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' - name: Build Trilinos + name: Configure and Build Trilinos + shell: bash -lc {0} run: | - cd trilinos_build - make -j 2 - + mkdir -p trilinos_build + mv package_enables.cmake trilinos_build + cd trilinos_build + + source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables + cmake -C genconfig_fragment.cmake -C package_enables.cmake .. + ninja -j 16 + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: From 5742da5d3c8b3599e1348e016a85484ec0826dcf Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 16:03:26 -0600 Subject: [PATCH 072/243] Fix arguments of get-changed-trilinos-packages.sh for CodeQL Fix calling of get-changed-trilinos-packages.sh to correctly reference the origin remote. 
Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index e0478400bf5a..0851448adb05 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -76,7 +76,7 @@ jobs: run: | git fetch origin ${GITHUB_BASE_REF} git branch - bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake" + bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' name: Configure and Build Trilinos shell: bash -lc {0} From 661569af6dffdaaaeb11f548ea26ac71f207e356 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:59:16 +0000 Subject: [PATCH 073/243] Bump actions/dependency-review-action from 4.3.4 to 4.4.0 Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.4 to 4.4.0. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/5a2ce3f5b92ee19cbb1541a4984c76d921601d7c...4081bf99e2866ebe428fc0477b69eb4fcda7220a) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 7b0990bcf5ca..bf29beac76d5 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: 'Dependency Review' - uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 + uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 From 40e117299710784a953a03469b2b32752a4ea29b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:59:27 +0000 Subject: [PATCH 074/243] Bump github/codeql-action from 3.26.13 to 3.27.0 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.13 to 3.27.0. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/f779452ac5af1c261dce0346a8f964149f49322b...662472033e021d55d94146f66f6058822b0b39fd) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 56bbf091adaf..4139508fa42b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -62,7 +62,7 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} @@ -85,6 +85,6 @@ jobs: make -j 2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index c648a7e9b626..46a2c4571aff 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -66,6 +66,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: sarif_file: results.sarif From 1d278e8c57e5f0a936f8eb8f6184222a19e5f681 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 17:12:28 -0600 Subject: [PATCH 075/243] Move GenConfig step into the Generate CMake fragment step The GenConfig step is just used to generate a cmake fragment for the configuration. This would fit nicely with the step that generates the other cmake fragment for package enables. 
Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0851448adb05..aa9b8043c094 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -72,20 +72,20 @@ jobs: run: | bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" - if: matrix.build-mode == 'manual' - name: Generate CMake fragment for changed packages + name: Generate CMake fragments run: | git fetch origin ${GITHUB_BASE_REF} - git branch + + mkdir -p trilinos_build && cd trilinos_build + + source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' name: Configure and Build Trilinos shell: bash -lc {0} run: | - mkdir -p trilinos_build - mv package_enables.cmake trilinos_build cd trilinos_build - source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables cmake -C genconfig_fragment.cmake -C package_enables.cmake .. ninja -j 16 From 237a61123802fd9d586eac0a32193178bbeda52e Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 17:14:14 -0600 Subject: [PATCH 076/243] Tidy up workflow file and add newlines Tidy up workflow file with consistent naming and add newlines between each named step for better readability. 
Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index aa9b8043c094..6fc58693f0b7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -57,20 +57,24 @@ jobs: query-filters: - exclude: tags: cpp/integer-multiplication-cast-to-long - - name: env + + - name: Print environment env: GITHUB_CONTEXT: ${{ toJson(github) }} run: | env - - name: module list + + - name: Module list shell: bash -l {0} run: | module list printenv PATH + - if: matrix.build-mode == 'manual' name: Get dependencies run: | bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + - if: matrix.build-mode == 'manual' name: Generate CMake fragments run: | @@ -80,15 +84,16 @@ jobs: source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" + - if: matrix.build-mode == 'manual' - name: Configure and Build Trilinos + name: Configure and build Trilinos shell: bash -lc {0} run: | cd trilinos_build cmake -C genconfig_fragment.cmake -C package_enables.cmake .. 
ninja -j 16 - + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: From 7727e25ee133d6fbdce85589cca9c6359cb4109e Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 08:38:52 -0600 Subject: [PATCH 077/243] Fix bash login shell for generate CMake fragment Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6fc58693f0b7..be4c96a2a393 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -77,6 +77,7 @@ jobs: - if: matrix.build-mode == 'manual' name: Generate CMake fragments + shell: bash -lc {0} run: | git fetch origin ${GITHUB_BASE_REF} From 26eb6ff3cd6d003781d9fa51be2108cd77b94a3d Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 09:00:32 -0600 Subject: [PATCH 078/243] Add TriBITS cache variables to reduce code built Add TriBITS cache variables to reduce code built for packages that are not needed. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index be4c96a2a393..130ed194da46 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -92,7 +92,7 @@ jobs: run: | cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake .. + cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. 
ninja -j 16 - name: Perform CodeQL Analysis From 1c01d54ee5d624f132f386fee433d7f8b61aa969 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 29 Oct 2024 16:02:57 -0600 Subject: [PATCH 079/243] Tpetra: Refactor FEWhichActive and FillState Signed-off-by: Christian Glusa --- .../tpetra/core/src/Tpetra_ConfigDefs.hpp | 18 ++++++ .../core/src/Tpetra_FECrsGraph_decl.hpp | 17 +---- .../tpetra/core/src/Tpetra_FECrsGraph_def.hpp | 38 +++++------ .../core/src/Tpetra_FECrsMatrix_decl.hpp | 19 ++---- .../core/src/Tpetra_FECrsMatrix_def.hpp | 64 +++++++++---------- .../core/src/Tpetra_FEMultiVector_decl.hpp | 17 +---- .../core/src/Tpetra_FEMultiVector_def.hpp | 34 +++++----- 7 files changed, 95 insertions(+), 112 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_ConfigDefs.hpp b/packages/tpetra/core/src/Tpetra_ConfigDefs.hpp index 64269d97d7f9..ec909ed0aef9 100644 --- a/packages/tpetra/core/src/Tpetra_ConfigDefs.hpp +++ b/packages/tpetra/core/src/Tpetra_ConfigDefs.hpp @@ -198,6 +198,24 @@ namespace Tpetra { Backward, Symmetric }; + + // FE* enums + namespace FE { + + // Enum for activity + enum WhichActive + { + ACTIVE_OWNED, + ACTIVE_OWNED_PLUS_SHARED + }; + + enum class FillState + { + open, // matrix is "open". Values can freely summed in to and replaced + modify, // matrix is open for modification. 
*local* values can be replaced + closed + }; + } } // For backwards compatibility diff --git a/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp index d14f6b3da408..adb8c325d2f8 100644 --- a/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp @@ -13,6 +13,7 @@ /// \file Tpetra_FECrsGraph_decl.hpp /// \brief Declaration of the Tpetra::FECrsGraph class +#include "Tpetra_ConfigDefs.hpp" #include "Tpetra_FECrsGraph_fwd.hpp" #include "Tpetra_CrsGraph_decl.hpp" @@ -548,25 +549,13 @@ namespace Tpetra { // template // Teuchos::RCP makeOwnedColMap (ViewType ownedGraphIndices); - // Enum for activity - enum FEWhichActive - { - FE_ACTIVE_OWNED, - FE_ACTIVE_OWNED_PLUS_SHARED - }; - - enum class FillState - { - open, // matrix is "open". Values can freely inserted - closed - }; - Teuchos::RCP fillState_; + Teuchos::RCP fillState_; // This is whichever graph isn't currently active Teuchos::RCP > inactiveCrsGraph_; // This is in RCP to make shallow copies of the FECrsGraph work correctly - Teuchos::RCP activeCrsGraph_; + Teuchos::RCP activeCrsGraph_; // The importer between the rowmaps of the two graphs Teuchos::RCP ownedRowsImporter_; diff --git a/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp index c8eb4ab9dfb9..6ef09873bea7 100644 --- a/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp @@ -182,8 +182,8 @@ setup(const Teuchos::RCP & ownedRowMap, if(ownedPlusSharedColMap.is_null()) this->allocateIndices(GlobalIndices); else this->allocateIndices(LocalIndices); - activeCrsGraph_ = Teuchos::rcp(new FEWhichActive(FE_ACTIVE_OWNED_PLUS_SHARED)); - fillState_ = Teuchos::rcp(new FillState(FillState::closed)); + activeCrsGraph_ = Teuchos::rcp(new FE::WhichActive(FE::ACTIVE_OWNED_PLUS_SHARED)); + fillState_ = Teuchos::rcp(new 
FE::FillState(FE::FillState::closed)); // Use a very strong map equivalence check bool maps_are_the_same = ownedRowMap->isSameAs(*ownedPlusSharedRowMap); @@ -221,7 +221,7 @@ setup(const Teuchos::RCP & ownedRowMap, template void FECrsGraph::doOwnedPlusSharedToOwned(const CombineMode CM) { const char tfecfFuncName[] = "FECrsGraph::doOwnedPlusSharedToOwned(CombineMode): "; - if(!ownedRowsImporter_.is_null() && *activeCrsGraph_ == FE_ACTIVE_OWNED_PLUS_SHARED) { + if(!ownedRowsImporter_.is_null() && *activeCrsGraph_ == FE::ACTIVE_OWNED_PLUS_SHARED) { Teuchos::RCP ownedRowMap = ownedRowsImporter_->getSourceMap(); // Do a self-export in "restricted mode" @@ -296,10 +296,10 @@ void FECrsGraph::doOwnedToOwnedPlusShared(con template void FECrsGraph::switchActiveCrsGraph() { - if(*activeCrsGraph_ == FE_ACTIVE_OWNED_PLUS_SHARED) - *activeCrsGraph_ = FE_ACTIVE_OWNED; + if(*activeCrsGraph_ == FE::ACTIVE_OWNED_PLUS_SHARED) + *activeCrsGraph_ = FE::ACTIVE_OWNED; else - *activeCrsGraph_ = FE_ACTIVE_OWNED_PLUS_SHARED; + *activeCrsGraph_ = FE::ACTIVE_OWNED_PLUS_SHARED; if(inactiveCrsGraph_.is_null()) return; @@ -318,10 +318,10 @@ void FECrsGraph::endFill( doing finite differences, things are easy --- just call fillComplete(). 
If, we are in the parallel FE case, then: - Precondition: FE_ACTIVE_OWNED_PLUS_SHARED mode + Precondition: FE::ACTIVE_OWNED_PLUS_SHARED mode Postconditions: - 1) FE_ACTIVE_OWNED mode + 1) FE::ACTIVE_OWNED mode 2) The OWNED graph has been fillCompleted with an Aztec-compatible column map 3) rowptr & (local) colinds are aliased between the two graphs 4) The OWNED_PLUS_SHARED graph has been fillCompleted with a column map whose first chunk @@ -333,7 +333,7 @@ void FECrsGraph::endFill( */ // Precondition const char tfecfFuncName[] = "FECrsGraph::endFill(domainMap, rangeMap): "; - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(*activeCrsGraph_ != FE_ACTIVE_OWNED_PLUS_SHARED,std::runtime_error, "must be in owned+shared mode."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(*activeCrsGraph_ != FE::ACTIVE_OWNED_PLUS_SHARED,std::runtime_error, "must be in owned+shared mode."); if(ownedRowsImporter_.is_null()) { // The easy case: One graph switchActiveCrsGraph(); @@ -365,7 +365,7 @@ void FECrsGraph::beginFill() { // Unlike FECrsMatrix and FEMultiVector, we do not allow you to call beginFill() after calling endFill() // So we throw an exception if you're in owned mode - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(*activeCrsGraph_ == FE_ACTIVE_OWNED,std::runtime_error, "can only be called once."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(*activeCrsGraph_ == FE::ACTIVE_OWNED,std::runtime_error, "can only be called once."); } @@ -373,11 +373,11 @@ template void FECrsGraph::beginAssembly() { const char tfecfFuncName[] = "FECrsGraph::beginAssembly: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::closed, + *fillState_ != FE::FillState::closed, std::runtime_error, "Cannot beginAssembly, matrix is not in a closed state" ); - *fillState_ = FillState::open; + *fillState_ = FE::FillState::open; this->beginFill(); } @@ -385,11 +385,11 @@ template void FECrsGraph::endAssembly() { const char tfecfFuncName[] = "FECrsGraph::endAssembly: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ 
!= FillState::open, + *fillState_ != FE::FillState::open, std::logic_error, "Cannot endAssembly, matrix is not open to fill but is closed." ); - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; this->endFill(); } @@ -400,11 +400,11 @@ void FECrsGraph::endAssembly( { const char tfecfFuncName[] = "FECrsGraph::endAssembly: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::open, + *fillState_ != FE::FillState::open, std::logic_error, "Cannot endAssembly, matrix is not open to fill but is closed." ); - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; this->endFill(domainMap, rangeMap); } @@ -428,7 +428,7 @@ FECrsGraph::insertGlobalIndicesImpl ( ){ const char tfecfFuncName[] = "FECrsGraph::insertGlobalIndices: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::open, + *fillState_ != FE::FillState::open, std::logic_error, "Cannot replace global values, matrix is not open to fill but is closed." ); @@ -445,7 +445,7 @@ FECrsGraph::insertGlobalIndicesImpl ( ){ const char tfecfFuncName[] = "FECrsGraph::insertGlobalIndices: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::open, + *fillState_ != FE::FillState::open, std::logic_error, "Cannot replace global values, matrix is not open to fill but is closed." ); @@ -461,7 +461,7 @@ FECrsGraph::insertLocalIndicesImpl ( ){ const char tfecfFuncName[] = "FECrsGraph::insertLocalIndices: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::open, + *fillState_ != FE::FillState::open, std::logic_error, "Cannot replace global values, matrix is not open to fill but is closed." 
); diff --git a/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp index 73d9db1d1b1c..2bf93ae03896 100644 --- a/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp @@ -14,6 +14,7 @@ /// \file Tpetra_FECrsMatrix_decl.hpp /// \brief Declaration of the Tpetra::FECrsMatrix class +#include "Tpetra_ConfigDefs.hpp" #include "Tpetra_CrsMatrix_decl.hpp" #include "Tpetra_FECrsGraph.hpp" @@ -292,12 +293,6 @@ class FECrsMatrix : //@} private: - // Enum for activity - enum FEWhichActive - { - FE_ACTIVE_OWNED, - FE_ACTIVE_OWNED_PLUS_SHARED - }; // The FECrsGraph from construction time Teuchos::RCP > feGraph_; @@ -305,15 +300,9 @@ class FECrsMatrix : // This is whichever multivector isn't currently active Teuchos::RCP > inactiveCrsMatrix_; // This is in RCP to make shallow copies of the FECrsMatrix work correctly - Teuchos::RCP activeCrsMatrix_; - - enum class FillState - { - open, // matrix is "open". Values can freely summed in to and replaced - modify, // matrix is open for modification. *local* values can be replaced - closed - }; - Teuchos::RCP fillState_; + Teuchos::RCP activeCrsMatrix_; + + Teuchos::RCP fillState_; }; // end class FECrsMatrix diff --git a/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp index 012c8ec6b6a5..892db94b2fa7 100644 --- a/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp @@ -36,8 +36,8 @@ FECrsMatrix(const Teuchos::RCP& graph, "fillComplete. 
In that case, you must call fillComplete on the graph " "again."); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - ( *graph->activeCrsGraph_!= fe_crs_graph_type::FE_ACTIVE_OWNED,std::runtime_error, - "Input graph must be in FE_ACTIVE_OWNED mode when this constructor is called."); + ( *graph->activeCrsGraph_!= FE::ACTIVE_OWNED,std::runtime_error, + "Input graph must be in FE::ACTIVE_OWNED mode when this constructor is called."); bool start_owned = false; if (! params.is_null ()) { @@ -46,9 +46,9 @@ FECrsMatrix(const Teuchos::RCP& graph, } } if(start_owned) { - activeCrsMatrix_ = Teuchos::rcp(new FEWhichActive(FE_ACTIVE_OWNED)); + activeCrsMatrix_ = Teuchos::rcp(new FE::WhichActive(FE::ACTIVE_OWNED)); } else { - activeCrsMatrix_ = Teuchos::rcp(new FEWhichActive(FE_ACTIVE_OWNED_PLUS_SHARED)); + activeCrsMatrix_ = Teuchos::rcp(new FE::WhichActive(FE::ACTIVE_OWNED_PLUS_SHARED)); } // Make an "inactive" matrix, if we need to @@ -58,14 +58,14 @@ FECrsMatrix(const Teuchos::RCP& graph, inactiveCrsMatrix_ = Teuchos::rcp(new crs_matrix_type(*this,graph)); } - fillState_ = Teuchos::rcp(new FillState(FillState::closed)); + fillState_ = Teuchos::rcp(new FE::FillState(FE::FillState::closed)); } template void FECrsMatrix::doOwnedPlusSharedToOwned(const CombineMode CM) { - if(!inactiveCrsMatrix_.is_null() && *activeCrsMatrix_ == FE_ACTIVE_OWNED_PLUS_SHARED) { + if(!inactiveCrsMatrix_.is_null() && *activeCrsMatrix_ == FE::ACTIVE_OWNED_PLUS_SHARED) { // Do a self-export in "restricted mode" this->doExport(*this,*feGraph_->ownedRowsImporter_,CM,true); inactiveCrsMatrix_->fillComplete(); @@ -81,10 +81,10 @@ void FECrsMatrix::doOwnedToOwnedPlusS template void FECrsMatrix::switchActiveCrsMatrix() { - if(*activeCrsMatrix_ == FE_ACTIVE_OWNED_PLUS_SHARED) - *activeCrsMatrix_ = FE_ACTIVE_OWNED; + if(*activeCrsMatrix_ == FE::ACTIVE_OWNED_PLUS_SHARED) + *activeCrsMatrix_ = FE::ACTIVE_OWNED; else - *activeCrsMatrix_ = FE_ACTIVE_OWNED_PLUS_SHARED; + *activeCrsMatrix_ = FE::ACTIVE_OWNED_PLUS_SHARED; 
if(inactiveCrsMatrix_.is_null()) return; @@ -95,7 +95,7 @@ void FECrsMatrix::switchActiveCrsMatr template void FECrsMatrix::endFill() { - if(*activeCrsMatrix_ == FE_ACTIVE_OWNED_PLUS_SHARED) { + if(*activeCrsMatrix_ == FE::ACTIVE_OWNED_PLUS_SHARED) { doOwnedPlusSharedToOwned(Tpetra::ADD); switchActiveCrsMatrix(); } @@ -107,7 +107,7 @@ template void FECrsMatrix::beginFill() { // Note: This does not throw an error since the on construction, the FECRS is in overlap mode. Ergo, calling beginFill(), // like one should expect to do in a rational universe, should not cause an error. - if(*activeCrsMatrix_ == FE_ACTIVE_OWNED) { + if(*activeCrsMatrix_ == FE::ACTIVE_OWNED) { this->resumeFill(); switchActiveCrsMatrix(); } @@ -117,59 +117,59 @@ void FECrsMatrix::beginFill() { template void FECrsMatrix::beginAssembly() { const char tfecfFuncName[] = "FECrsMatrix::beginAssembly: "; - if (*fillState_ != FillState::closed) + if (*fillState_ != FE::FillState::closed) { std::ostringstream errmsg; errmsg << "Cannot begin assembly, matrix is not in a closed state " << "but is currently open for " - << (*fillState_ == FillState::open ? "assembly" : "modification"); + << (*fillState_ == FE::FillState::open ? "assembly" : "modification"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } - *fillState_ = FillState::open; + *fillState_ = FE::FillState::open; this->beginFill(); } template void FECrsMatrix::endAssembly() { const char tfecfFuncName[] = "FECrsMatrix::endAssembly: "; - if (*fillState_ != FillState::open) + if (*fillState_ != FE::FillState::open) { std::ostringstream errmsg; errmsg << "Cannot end assembly, matrix is not open for assembly " << "but is currently " - << (*fillState_ == FillState::closed ? "closed" : "open for modification"); + << (*fillState_ == FE::FillState::closed ? 
"closed" : "open for modification"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; this->endFill(); } template void FECrsMatrix::beginModify() { const char tfecfFuncName[] = "FECrsMatrix::beginModify: "; - if (*fillState_ != FillState::closed) + if (*fillState_ != FE::FillState::closed) { std::ostringstream errmsg; errmsg << "Cannot begin modifying, matrix is not in a closed state " << "but is currently open for " - << (*fillState_ == FillState::open ? "assembly" : "modification"); + << (*fillState_ == FE::FillState::open ? "assembly" : "modification"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } - *fillState_ = FillState::modify; + *fillState_ = FE::FillState::modify; this->resumeFill(); } template void FECrsMatrix::endModify() { const char tfecfFuncName[] = "FECrsMatrix::endModify: "; - if (*fillState_ != FillState::modify) + if (*fillState_ != FE::FillState::modify) { std::ostringstream errmsg; errmsg << "Cannot end modifying, matrix is not open to modify but is currently " - << (*fillState_ == FillState::open ? "open for assembly" : "closed"); + << (*fillState_ == FE::FillState::open ? "open for assembly" : "closed"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; this->fillComplete(); } @@ -184,12 +184,12 @@ FECrsMatrix::replaceGlobalValuesImpl( const LocalOrdinal numElts) { const char tfecfFuncName[] = "FECrsMatrix::replaceGlobalValues: "; - if (*fillState_ != FillState::open) + if (*fillState_ != FE::FillState::open) { std::ostringstream errmsg; errmsg << "Cannot replace global values, matrix is not open for assembly " << "but is currently " - << (*fillState_ == FillState::modify ? "open for modification" : "closed"); + << (*fillState_ == FE::FillState::modify ? 
"open for modification" : "closed"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } return crs_matrix_type::replaceGlobalValuesImpl(rowVals, graph, rowInfo, inds, newVals, numElts); @@ -206,7 +206,7 @@ FECrsMatrix::replaceLocalValuesImpl( const LocalOrdinal numElts) { const char tfecfFuncName[] = "FECrsMatrix::replaceLocalValues: "; - if (*fillState_ != FillState::open && *fillState_ != FillState::modify) + if (*fillState_ != FE::FillState::open && *fillState_ != FE::FillState::modify) { std::ostringstream errmsg; errmsg << "Cannot replace local values, matrix is not open to fill/modify. " @@ -228,12 +228,12 @@ FECrsMatrix::sumIntoGlobalValuesImpl( const bool atomic) { const char tfecfFuncName[] = "FECrsMatrix::sumIntoGlobalValues: "; - if (*fillState_ != FillState::open) + if (*fillState_ != FE::FillState::open) { std::ostringstream errmsg; errmsg << "Cannot sum in to global values, matrix is not open for assembly. " << "The matrix is currently " - << (*fillState_ == FillState::modify ? "open for modification" : "closed"); + << (*fillState_ == FE::FillState::modify ? "open for modification" : "closed"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } return crs_matrix_type::sumIntoGlobalValuesImpl( @@ -253,12 +253,12 @@ FECrsMatrix::sumIntoLocalValuesImpl( const bool atomic) { const char tfecfFuncName[] = "FECrsMatrix::sumIntoLocalValues: "; - if (*fillState_ != FillState::open) + if (*fillState_ != FE::FillState::open) { std::ostringstream errmsg; errmsg << "Cannot sum in to local values, matrix is not open for assembly. " << "The matrix is currently " - << (*fillState_ == FillState::modify ? "open for modification" : "closed"); + << (*fillState_ == FE::FillState::modify ? 
"open for modification" : "closed"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } return crs_matrix_type::sumIntoLocalValuesImpl( @@ -276,12 +276,12 @@ FECrsMatrix::insertGlobalValuesImpl( const size_t numInputEnt) { const char tfecfFuncName[] = "FECrsMatrix::insertGlobalValues: "; - if (*fillState_ != FillState::open) + if (*fillState_ != FE::FillState::open) { std::ostringstream errmsg; errmsg << "Cannot insert global values, matrix is not open for assembly. " << "The matrix is currently " - << (*fillState_ == FillState::modify ? "open for modification" : "closed"); + << (*fillState_ == FE::FillState::modify ? "open for modification" : "closed"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, errmsg.str()); } return crs_matrix_type::insertGlobalValuesImpl(graph, rowInfo, gblColInds, vals, numInputEnt); diff --git a/packages/tpetra/core/src/Tpetra_FEMultiVector_decl.hpp b/packages/tpetra/core/src/Tpetra_FEMultiVector_decl.hpp index 74ff5cbaadb2..c505c54d7f52 100644 --- a/packages/tpetra/core/src/Tpetra_FEMultiVector_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_FEMultiVector_decl.hpp @@ -174,20 +174,7 @@ namespace Tpetra { /// you call this method. void replaceMap (const Teuchos::RCP& map); - //! Enum for activity - enum FEWhichActive - { - FE_ACTIVE_OWNED_PLUS_SHARED, - FE_ACTIVE_OWNED - }; - - enum class FillState - { - open, // matrix is "open". Values can freely summed in to and replaced - modify, // matrix is open for modification. *local* values can be replaced - closed - }; - Teuchos::RCP fillState_; + Teuchos::RCP fillState_; //! Whichever MultiVector is not currently active. Teuchos::RCP inactiveMultiVector_; @@ -197,7 +184,7 @@ namespace Tpetra { /// /// This is an RCP in order to make shallow copies of the /// FEMultiVector work correctly. - Teuchos::RCP activeMultiVector_; + Teuchos::RCP activeMultiVector_; //! Import object used for communication between the two MultiVectors. 
Teuchos::RCP> importer_; diff --git a/packages/tpetra/core/src/Tpetra_FEMultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_FEMultiVector_def.hpp index 68e20b0517a6..a217f64711d5 100644 --- a/packages/tpetra/core/src/Tpetra_FEMultiVector_def.hpp +++ b/packages/tpetra/core/src/Tpetra_FEMultiVector_def.hpp @@ -29,7 +29,7 @@ FEMultiVector (const Teuchos::RCP& map, const bool zeroOut) : base_type (importer.is_null () ? map : importer->getTargetMap (), numVecs, zeroOut), - activeMultiVector_ (Teuchos::rcp (new FEWhichActive (FE_ACTIVE_OWNED_PLUS_SHARED))), + activeMultiVector_ (Teuchos::rcp (new FE::WhichActive (FE::ACTIVE_OWNED_PLUS_SHARED))), importer_ (importer) { const char tfecfFuncName[] = "FEMultiVector constructor: "; @@ -60,7 +60,7 @@ FEMultiVector (const Teuchos::RCP& map, inactiveMultiVector_ = Teuchos::rcp (new base_type (*this, importer_->getSourceMap(), 0)); } - fillState_ = Teuchos::rcp(new FillState(FillState::closed)); + fillState_ = Teuchos::rcp(new FE::FillState(FE::FillState::closed)); } template @@ -70,7 +70,7 @@ beginFill () { // The FEMultiVector is in owned+shared mode on construction, so we // do not throw in that case. 
- if (*activeMultiVector_ == FE_ACTIVE_OWNED) { + if (*activeMultiVector_ == FE::ACTIVE_OWNED) { switchActiveMultiVector (); } } @@ -82,7 +82,7 @@ endFill () { const char tfecfFuncName[] = "endFill: "; - if (*activeMultiVector_ == FE_ACTIVE_OWNED_PLUS_SHARED) { + if (*activeMultiVector_ == FE::ACTIVE_OWNED_PLUS_SHARED) { doOwnedPlusSharedToOwned (Tpetra::ADD); switchActiveMultiVector (); } @@ -97,11 +97,11 @@ template void FEMultiVector::beginAssembly() { const char tfecfFuncName[] = "FEMultiVector::beginAssembly: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::closed, + *fillState_ != FE::FillState::closed, std::runtime_error, "Cannot beginAssembly, matrix is not in a closed state" ); - *fillState_ = FillState::open; + *fillState_ = FE::FillState::open; this->beginFill(); } @@ -109,11 +109,11 @@ template void FEMultiVector::endAssembly() { const char tfecfFuncName[] = "FEMultiVector::endAssembly: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::open, + *fillState_ != FE::FillState::open, std::runtime_error, "Cannot endAssembly, matrix is not open to fill." ); - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; this->endFill(); } @@ -121,22 +121,22 @@ template void FEMultiVector::beginModify() { const char tfecfFuncName[] = "FEMultiVector::beginModify: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::closed, + *fillState_ != FE::FillState::closed, std::runtime_error, "Cannot beginModify, matrix is not in a closed state" ); - *fillState_ = FillState::modify; + *fillState_ = FE::FillState::modify; } template void FEMultiVector::endModify() { const char tfecfFuncName[] = "FEMultiVector::endModify: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - *fillState_ != FillState::modify, + *fillState_ != FE::FillState::modify, std::runtime_error, "Cannot endModify, matrix is not open to modify." 
); - *fillState_ = FillState::closed; + *fillState_ = FE::FillState::closed; } template @@ -164,7 +164,7 @@ FEMultiVector:: doOwnedPlusSharedToOwned (const CombineMode CM) { if (! importer_.is_null () && - *activeMultiVector_ == FE_ACTIVE_OWNED_PLUS_SHARED) { + *activeMultiVector_ == FE::ACTIVE_OWNED_PLUS_SHARED) { inactiveMultiVector_->doExport (*this, *importer_, CM); } } @@ -175,7 +175,7 @@ FEMultiVector:: doOwnedToOwnedPlusShared (const CombineMode CM) { if (! importer_.is_null () && - *activeMultiVector_ == FE_ACTIVE_OWNED) { + *activeMultiVector_ == FE::ACTIVE_OWNED) { inactiveMultiVector_->doImport (*this, *importer_, CM); } } @@ -185,11 +185,11 @@ void FEMultiVector:: switchActiveMultiVector () { - if (*activeMultiVector_ == FE_ACTIVE_OWNED_PLUS_SHARED) { - *activeMultiVector_ = FE_ACTIVE_OWNED; + if (*activeMultiVector_ == FE::ACTIVE_OWNED_PLUS_SHARED) { + *activeMultiVector_ = FE::ACTIVE_OWNED; } else { - *activeMultiVector_ = FE_ACTIVE_OWNED_PLUS_SHARED; + *activeMultiVector_ = FE::ACTIVE_OWNED_PLUS_SHARED; } if (importer_.is_null ()) { From 8432626143178985ad85bbc7111cbe8c985ffc8c Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 29 Oct 2024 16:03:33 -0600 Subject: [PATCH 080/243] PyTrilinos2: Expose Tpetra::FE* Signed-off-by: Christian Glusa --- packages/PyTrilinos2/CMakeLists.txt | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/packages/PyTrilinos2/CMakeLists.txt b/packages/PyTrilinos2/CMakeLists.txt index d912b7efe078..3813a95d5cf0 100644 --- a/packages/PyTrilinos2/CMakeLists.txt +++ b/packages/PyTrilinos2/CMakeLists.txt @@ -29,7 +29,7 @@ PYTRILINOS2_CMAKE_ERROR TRIBITS_ADD_OPTION_AND_DEFINE(PyTrilinos2_BINDER_VERBOSE PYTRILINOS2_B_VERBOSE "Increase the verbosity of binder." 
- OFF ) + OFF ) SET(PyTrilinos2_BINDER_NUM_FILES "100" CACHE STRING "Maxinum number of generated files by binder.") @@ -184,7 +184,7 @@ FOREACH(line IN LISTS eti_files_without_dir) ENDFOREACH(line) file(WRITE ${all_ETI_files_list} ${CONTENTS}) -SET(ETI_classes "Tpetra_CrsMatrix;Tpetra_Vector;Tpetra_MultiVector") +SET(ETI_classes "Tpetra_CrsMatrix;Tpetra_Vector;Tpetra_MultiVector;Tpetra_FEMultiVector;Tpetra_FECrsMatrix") SET(CONTENTS "") FOREACH(line IN LISTS ETI_classes) SET(CONTENTS "${CONTENTS}${line}\n") @@ -229,7 +229,7 @@ IF(PYTRILINOS2_B_VERBOSE) ENDIF() IF(PYTRILINOS2_SUPPRESS_ERRORS) list(APPEND BINDER_OPTIONS --suppress-errors) -ENDIF() +ENDIF() list(APPEND BINDER_OPTIONS --config ${CMAKE_CURRENT_SOURCE_DIR}/scripts/PyTrilinos2_config.cfg) list(APPEND BINDER_OPTIONS --) IF(TPL_ENABLE_CUDA) @@ -241,6 +241,14 @@ if (NOT(MPI_BASE_DIR STREQUAL "")) list(APPEND BINDER_OPTIONS -I${MPI_BASE_DIR}/include) ENDIF() list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/mdspan) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/View/MDSpan) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental/__p0009_bits) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental/__p1684_bits) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental/__p2389_bits) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental/__p2630_bits) +list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/include_tmp/experimental/__p2642_bits) list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_BINARY_DIR}/src) list(APPEND BINDER_OPTIONS -I${CMAKE_CURRENT_SOURCE_DIR}/src) IF(NOT DEFINED PyTrilinos2_BINDER_GCC_TOOLCHAIN) From c367a234d9de789c533a4e7b853ed64a463e0a8d Mon Sep 17 00:00:00 2001 From: Anderson Chauphan 
Date: Tue, 29 Oct 2024 16:41:33 -0600 Subject: [PATCH 081/243] Use multi-line yml for cmake command Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 130ed194da46..3ada32ba19fe 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -92,7 +92,11 @@ jobs: run: | cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. + cmake -C genconfig_fragment.cmake -C package_enables.cmake \ + -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ + -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ + -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. + ninja -j 16 - name: Perform CodeQL Analysis From 54d711e40daac2252c661c52482c29a9179b6974 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 17:02:58 -0600 Subject: [PATCH 082/243] Manually disable each deprecated package in cmake command Manually disable each deprecated package in the cmake command for CodeQL configuration. Since these are defined on the command line, they should take priority over any of the enables from the package_enables.cmake or the genconfig_fragment.cmake fragments. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 3ada32ba19fe..b739518db9ef 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -95,7 +95,22 @@ jobs: cmake -C genconfig_fragment.cmake -C package_enables.cmake \ -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ - -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. 
+ -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF \ + -DTrilinos_ENABLE_Amesos=OFF \ + -DTrilinos_ENABLE_AztecOO=OFF \ + -DTrilinos_ENABLE_Epetra=OFF \ + -DTrilinos_ENABLE_EpetraExt=OFF \ + -DTrilinos_ENABLE_Ifpack=OFF \ + -DTrilinos_ENABLE_Intrepid=OFF \ + -DTrilinos_ENABLE_Isorropia=OFF \ + -DTrilinos_ENABLE_ML=OFF \ + -DTrilinos_ENABLE_NewPackage=OFF \ + -DTrilinos_ENABLE_Pliris=OFF \ + -DTrilinos_ENABLE_PyTrilinos=OFF \ + -DTrilinos_ENABLE_ShyLU_DDCore=OFF \ + -DTrilinos_ENABLE_ThyraEpetraAdapters=OFF \ + -DTrilinos_ENABLE_ThyraEpetraExtAdapters=OFF \ + -DTrilinos_ENABLE_Triutils=OFF .. ninja -j 16 From f07a54fbacbf35d6d7954fba0d7611c47c112761 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 17:18:12 -0600 Subject: [PATCH 083/243] Fix spack.yml syntax Signed-off-by: Anderson Chauphan --- .github/workflows/spack.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml index c722c1287ed1..59976c1d9b3e 100644 --- a/.github/workflows/spack.yml +++ b/.github/workflows/spack.yml @@ -4,9 +4,9 @@ on: types: - opened - synchronize - branches: - - master - - develop + branches: + - master + - develop workflow_dispatch: # Cancels any in progress 'workflow' associated with this PR From 6ed29e6bdb7d3854d8175c3773e7f69f1360952e Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 17:18:36 -0600 Subject: [PATCH 084/243] Fix AT2.yml syntax Signed-off-by: Anderson Chauphan --- .github/workflows/AT2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index 01937667d612..b232051eddf2 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -6,8 +6,8 @@ on: - opened - synchronize branches: - - master - - develop + - master + - develop workflow_dispatch: # Cancels any in progress 'workflows' associated with this PR From 16ed3aba8b676f0eb73c7f3f4761ecf67c7145d7 Mon Sep 17 
00:00:00 2001 From: Christian Glusa Date: Tue, 29 Oct 2024 19:37:06 -0600 Subject: [PATCH 085/243] Teuchos: Delete ConstNonconstObjectContainer::count method Signed-off-by: Christian Glusa --- .../teuchos/core/src/Teuchos_ConstNonconstObjectContainer.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/packages/teuchos/core/src/Teuchos_ConstNonconstObjectContainer.hpp b/packages/teuchos/core/src/Teuchos_ConstNonconstObjectContainer.hpp index fae32b32a9f2..a882494097e3 100644 --- a/packages/teuchos/core/src/Teuchos_ConstNonconstObjectContainer.hpp +++ b/packages/teuchos/core/src/Teuchos_ConstNonconstObjectContainer.hpp @@ -328,9 +328,6 @@ class ConstNonconstObjectContainer { /** \brief Perform an implicit conversion to an RCP. */ operator RCP() const { return getConstObj(); } - /** \brief Return the internal count. */ - int count() const - { return constObj_.count(); } private: RCP constObj_; From 0acffe8caab7916cf1e688bfe636db0283a43af7 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 30 Oct 2024 09:48:01 -0600 Subject: [PATCH 086/243] tpetra: replace use of impl_dualview_is_single_device replace use of Kokkos impl_* routine, preemptive change in case internal impl routines become private members Signed-off-by: Nathan Ellingwood --- packages/tpetra/core/src/Tpetra_MultiVector_def.hpp | 2 +- .../core/test/ImportExport2/ImportExport2_UnitTests.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp index 068927e39ed2..9fafc1502c56 100644 --- a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp +++ b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp @@ -1795,7 +1795,7 @@ void MultiVector::copyAndPermute( // - CombineMode needs to be INSERT. // - The number of vectors needs to be 1, otherwise we need to // reorder the received data. 
- if ((dual_view_type::impl_dualview_is_single_device::value || + if ((std::is_same_v || (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) || (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) && areRemoteLIDsContiguous && diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 1482f3132e3e..728df1dcd5b0 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -730,7 +730,7 @@ namespace { // MV::imports_ and MV::view_ have the same memory space, the // imports_ view is aliased to the data view of the target MV. if ((myImageID == collectRank) && (myImageID == 0)) { - if (mv_type::dual_view_type::impl_dualview_is_single_device::value) + if (std::is_same_v) TEUCHOS_ASSERT(tgt_mv->importsAreAliased()); // else { // We do not know if copyAndPermute was run on host or device. @@ -800,7 +800,7 @@ namespace { // MV::imports_ and MV::view_ have the same memory space, the // imports_ view is aliased to the data view of the target MV. if ((myImageID == collectRank) && (myImageID == 0)) { - if (mv_type::dual_view_type::impl_dualview_is_single_device::value) + if (std::is_same_v) TEUCHOS_ASSERT(tgt_mv->importsAreAliased()); // else { // We do not know if copyAndPermute was run on host or device. 
From 6555e68d6d82188baaa9cd37fa9751fa831b9332 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 30 Oct 2024 15:16:39 -0600 Subject: [PATCH 087/243] shylubasker: remove unused code resolve compilation errors with printRHS and printSOL Signed-off-by: Nathan Ellingwood --- .../basker/src/shylubasker_decl.hpp | 2 - .../shylu_node/basker/src/shylubasker_def.hpp | 4 -- .../basker/src/shylubasker_thread.hpp | 2 +- .../basker/src/shylubasker_util.hpp | 55 ------------------- 4 files changed, 1 insertion(+), 62 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp index f9b33e325bd7..09e3f6f98382 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp @@ -1160,8 +1160,6 @@ namespace BaskerNS void printMTX(std::string fname, BASKER_MATRIX &M); void printMTX(std::string fname, BASKER_MATRIX &M, BASKER_BOOL off); void readMTX(std::string fname, BASKER_MATRIX &M); - int printRHS(); - int printSOL(); void printTree(); BASKER_INLINE diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp index c7b9d66311ab..35d8588b0bd9 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_def.hpp @@ -2307,10 +2307,6 @@ namespace BaskerNS printU(); printUMTX(); std::cout << "U printed" << std::endl; - //printRHS(); - std::cout << "RHS printed" << std::endl; - //printSOL(); - std::cout << "SOL printed" << std::endl; //printTree(); std::cout << "Tree printed" << std::endl; diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp index ebce20c9875f..6e4d1554c754 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp @@ 
-272,7 +272,7 @@ namespace BaskerNS BASKER_INLINE void atomic_barrier_fanout(volatile Int &value, const Int l_size) { - Kokkos::atomic_inc(&(value)) + Kokkos::atomic_inc(&(value)); while(value < l_size) { BASKER_NO_OP; diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 2d8322c05de2..455b76004a98 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -1472,61 +1472,6 @@ namespace BaskerNS }//end readMTX() - //Print out RHS RHS.txt - template - int Basker::printRHS() - { - if(solve_flag == false) - {return -1;} - - FILE *fp; - fp = fopen("RHS.txt", "w"); - - //over each row - for(Int r = 0; r < A.nrow; r++) - { - //over each column NOTE: come back to - //for(Int k = 0; k < rhs.size(); k++) - for(Int k = 0; k < 1; k++) - { - //fprintf(fp, "%ld %ld %f, ", (long)r, (long)gperm[r], rhs[k][r]); - fprintf(fp, "%ld %ld %.16e, ", (long)r, (long)gperm[r], rhs[k][r]); - }//end over each column - fprintf(fp, "\n"); - }//end over each row - - fclose(fp); - - return 0; - }//end printRHS() - - //Print solution SOL.txt - template - int Basker::printSOL() - { - if(solve_flag == false) - {return -1;} - - FILE *fp; - fp = fopen("SOL.txt", "w"); - - //over each row - for(Int r = 0; r < A.nrow; r++) - { - //over each column Note: come back to - //for(Int k = 0; k < rhs.size(); k++) - for(Int k = 0 ; k < 1; k++) - { - fprintf(fp, "%ld %ld %f, ", (long)r, (long)gperm[r], sol[k][r]); - }//end over each column - fprintf(fp, "\n"); - }//end over each row - - fclose(fp); - - return 0; - }//end printSOL() - //Prints the given tree into a file to analyze template void Basker::printTree() From e4752f04f5c31c4382601ec9796f242335bcb74c Mon Sep 17 00:00:00 2001 From: mperego Date: Fri, 1 Nov 2024 08:33:16 -0600 Subject: [PATCH 088/243] Intrepid2: Implementation of team-level Basis::getValues (#13437) - Implemented team-level 
getValues for classic Lagrangian basis functions. - Modified/added tests to compare the team-level getValues with host getValues - Modified implementation of JacobiPolynomial to reduce FAD temporaries Signed-off-by: Mauro Perego --- .../Discretization/Basis/Intrepid2_Basis.hpp | 55 ++++ .../Basis/Intrepid2_HCURL_HEX_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HCURL_HEX_I1_FEMDef.hpp | 51 ++++ .../Basis/Intrepid2_HCURL_HEX_In_FEM.hpp | 32 ++- .../Basis/Intrepid2_HCURL_HEX_In_FEMDef.hpp | 112 ++++++-- .../Basis/Intrepid2_HCURL_QUAD_I1_FEM.hpp | 32 ++- .../Basis/Intrepid2_HCURL_QUAD_I1_FEMDef.hpp | 53 +++- .../Basis/Intrepid2_HCURL_QUAD_In_FEM.hpp | 32 ++- .../Basis/Intrepid2_HCURL_QUAD_In_FEMDef.hpp | 103 ++++++-- .../Basis/Intrepid2_HCURL_TET_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HCURL_TET_I1_FEMDef.hpp | 51 ++++ .../Basis/Intrepid2_HCURL_TET_In_FEM.hpp | 23 +- .../Basis/Intrepid2_HCURL_TET_In_FEMDef.hpp | 95 +++++-- .../Basis/Intrepid2_HCURL_TRI_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HCURL_TRI_I1_FEMDef.hpp | 51 +++- .../Basis/Intrepid2_HCURL_TRI_In_FEM.hpp | 23 +- .../Basis/Intrepid2_HCURL_TRI_In_FEMDef.hpp | 94 +++++-- .../Basis/Intrepid2_HCURL_WEDGE_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HCURL_WEDGE_I1_FEMDef.hpp | 52 ++++ .../Basis/Intrepid2_HDIV_HEX_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HDIV_HEX_I1_FEMDef.hpp | 52 ++++ .../Basis/Intrepid2_HDIV_HEX_In_FEM.hpp | 34 ++- .../Basis/Intrepid2_HDIV_HEX_In_FEMDef.hpp | 102 ++++++-- .../Basis/Intrepid2_HDIV_QUAD_I1_FEM.hpp | 32 ++- .../Basis/Intrepid2_HDIV_QUAD_I1_FEMDef.hpp | 51 +++- .../Basis/Intrepid2_HDIV_QUAD_In_FEM.hpp | 33 ++- .../Basis/Intrepid2_HDIV_QUAD_In_FEMDef.hpp | 104 ++++++-- .../Basis/Intrepid2_HDIV_TET_I1_FEM.hpp | 35 ++- .../Basis/Intrepid2_HDIV_TET_I1_FEMDef.hpp | 52 ++++ .../Basis/Intrepid2_HDIV_TET_In_FEM.hpp | 227 ++++++++-------- .../Basis/Intrepid2_HDIV_TET_In_FEMDef.hpp | 101 ++++++-- .../Basis/Intrepid2_HDIV_TRI_I1_FEM.hpp | 35 ++- .../Basis/Intrepid2_HDIV_TRI_I1_FEMDef.hpp | 52 ++++ 
.../Basis/Intrepid2_HDIV_TRI_In_FEM.hpp | 60 +++-- .../Basis/Intrepid2_HDIV_TRI_In_FEMDef.hpp | 94 +++++-- .../Basis/Intrepid2_HDIV_WEDGE_I1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HDIV_WEDGE_I1_FEMDef.hpp | 53 ++++ .../Basis/Intrepid2_HGRAD_HEX_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_HEX_C1_FEMDef.hpp | 49 ++++ .../Basis/Intrepid2_HGRAD_HEX_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_HEX_C2_FEMDef.hpp | 49 ++++ .../Basis/Intrepid2_HGRAD_HEX_Cn_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_HEX_Cn_FEMDef.hpp | 65 ++++- .../Basis/Intrepid2_HGRAD_LINE_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_LINE_C1_FEMDef.hpp | 51 +++- .../Basis/Intrepid2_HGRAD_LINE_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_LINE_C2_FEMDef.hpp | 51 +++- .../Basis/Intrepid2_HGRAD_LINE_Cn_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_LINE_Cn_FEMDef.hpp | 92 +++++-- .../Basis/Intrepid2_HGRAD_PYR_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_PYR_C1_FEMDef.hpp | 50 +++- .../Basis/Intrepid2_HGRAD_PYR_I2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_PYR_I2_FEMDef.hpp | 50 +++- .../Basis/Intrepid2_HGRAD_QUAD_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_QUAD_C1_FEMDef.hpp | 58 +++++ .../Basis/Intrepid2_HGRAD_QUAD_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_QUAD_C2_FEMDef.hpp | 61 +++++ .../Basis/Intrepid2_HGRAD_QUAD_Cn_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_QUAD_Cn_FEMDef.hpp | 127 ++++++--- .../Basis/Intrepid2_HGRAD_TET_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_TET_C1_FEMDef.hpp | 49 ++++ .../Basis/Intrepid2_HGRAD_TET_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_TET_C2_FEMDef.hpp | 49 ++++ .../Basis/Intrepid2_HGRAD_TET_COMP12_FEM.hpp | 21 +- .../Intrepid2_HGRAD_TET_COMP12_FEMDef.hpp | 51 +++- .../Basis/Intrepid2_HGRAD_TET_Cn_FEM.hpp | 92 ++++--- .../Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp | 128 ++++++--- .../Basis/Intrepid2_HGRAD_TRI_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_TRI_C1_FEMDef.hpp | 58 +++++ .../Basis/Intrepid2_HGRAD_TRI_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_TRI_C2_FEMDef.hpp | 58 
+++++ .../Basis/Intrepid2_HGRAD_TRI_Cn_FEM.hpp | 96 ++++--- .../Basis/Intrepid2_HGRAD_TRI_Cn_FEMDef.hpp | 147 +++++++---- .../Intrepid2_HGRAD_TRI_Cn_FEM_ORTHDef.hpp | 61 ----- .../Basis/Intrepid2_HGRAD_WEDGE_C1_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_WEDGE_C1_FEMDef.hpp | 49 ++++ .../Basis/Intrepid2_HGRAD_WEDGE_C2_FEM.hpp | 17 ++ .../Basis/Intrepid2_HGRAD_WEDGE_C2_FEMDef.hpp | 82 ++++-- .../Basis/Intrepid2_HVOL_HEX_Cn_FEM.hpp | 35 ++- .../Basis/Intrepid2_HVOL_HEX_Cn_FEMDef.hpp | 102 ++++++-- .../Basis/Intrepid2_HVOL_LINE_Cn_FEM.hpp | 34 ++- .../Basis/Intrepid2_HVOL_LINE_Cn_FEMDef.hpp | 86 ++++-- .../Basis/Intrepid2_HVOL_QUAD_Cn_FEM.hpp | 36 ++- .../Basis/Intrepid2_HVOL_QUAD_Cn_FEMDef.hpp | 102 ++++++-- .../Basis/Intrepid2_HVOL_TET_Cn_FEM.hpp | 42 ++- .../Basis/Intrepid2_HVOL_TET_Cn_FEMDef.hpp | 94 +++++-- .../Basis/Intrepid2_HVOL_TRI_Cn_FEM.hpp | 43 ++- .../Basis/Intrepid2_HVOL_TRI_Cn_FEMDef.hpp | 101 ++++++-- ...Intrepid2_CubatureControlVolumeSideDef.hpp | 2 +- .../src/Shared/Intrepid2_PolylibDef.hpp | 217 ++++++++-------- .../intrepid2/src/Shared/Intrepid2_Utils.hpp | 26 ++ .../Basis/HCURL_HEX_I1_FEM/CMakeLists.txt | 82 +++++- .../Basis/HCURL_HEX_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_HEX_I1_FEM/test_02.hpp | 187 ++++++++++++++ .../Basis/HCURL_HEX_In_FEM/CMakeLists.txt | 81 +++++- .../Basis/HCURL_HEX_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_HEX_In_FEM/test_02.hpp | 203 +++++++++++++++ .../Basis/HCURL_QUAD_I1_FEM/CMakeLists.txt | 81 +++++- .../HCURL_QUAD_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_QUAD_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HCURL_QUAD_In_FEM/CMakeLists.txt | 79 +++++- .../HCURL_QUAD_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_QUAD_In_FEM/test_02.hpp | 189 ++++++++++++++ .../Basis/HCURL_TET_I1_FEM/CMakeLists.txt | 82 +++++- .../Basis/HCURL_TET_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_TET_I1_FEM/test_02.hpp | 187 ++++++++++++++ .../Basis/HCURL_TET_In_FEM/CMakeLists.txt | 79 +++++- 
.../Basis/HCURL_TET_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_TET_In_FEM/test_02.hpp | 205 +++++++++++++++ .../Basis/HCURL_TRI_I1_FEM/CMakeLists.txt | 81 +++++- .../Basis/HCURL_TRI_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_TRI_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HCURL_TRI_In_FEM/CMakeLists.txt | 79 +++++- .../Basis/HCURL_TRI_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_TRI_In_FEM/test_02.hpp | 189 ++++++++++++++ .../Basis/HCURL_WEDGE_I1_FEM/CMakeLists.txt | 82 +++++- .../HCURL_WEDGE_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HCURL_WEDGE_I1_FEM/test_02.hpp | 188 ++++++++++++++ .../Basis/HDIV_HEX_I1_FEM/CMakeLists.txt | 84 +++++- .../Basis/HDIV_HEX_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_HEX_I1_FEM/test_02.hpp | 186 +++++++++++++ .../Basis/HDIV_HEX_In_FEM/CMakeLists.txt | 79 +++++- .../Basis/HDIV_HEX_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_HEX_In_FEM/test_02.hpp | 190 ++++++++++++++ .../Basis/HDIV_QUAD_I1_FEM/CMakeLists.txt | 84 +++++- .../Basis/HDIV_QUAD_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_QUAD_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HDIV_QUAD_In_FEM/CMakeLists.txt | 79 +++++- .../Basis/HDIV_QUAD_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_QUAD_In_FEM/test_02.hpp | 190 ++++++++++++++ .../Basis/HDIV_TET_I1_FEM/CMakeLists.txt | 84 +++++- .../Basis/HDIV_TET_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_TET_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HDIV_TET_In_FEM/CMakeLists.txt | 79 +++++- .../Basis/HDIV_TET_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_TET_In_FEM/test_02.hpp | 190 ++++++++++++++ .../Basis/HDIV_TRI_I1_FEM/CMakeLists.txt | 84 +++++- .../Basis/HDIV_TRI_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_TRI_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HDIV_TRI_In_FEM/CMakeLists.txt | 79 +++++- .../Basis/HDIV_TRI_In_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_TRI_In_FEM/test_02.hpp | 189 ++++++++++++++ 
.../Basis/HDIV_WEDGE_I1_FEM/CMakeLists.txt | 81 +++++- .../HDIV_WEDGE_I1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HDIV_WEDGE_I1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HGRAD_HEX_C1_FEM/CMakeLists.txt | 74 ++++++ .../Basis/HGRAD_HEX_C1_FEM/eti/test_03_ETI.in | 52 ++++ .../Basis/HGRAD_HEX_C1_FEM/test_03.hpp | 184 +++++++++++++ .../Basis/HGRAD_HEX_C2_FEM/CMakeLists.txt | 86 +++++- .../Basis/HGRAD_HEX_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_HEX_C2_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_HEX_Cn_FEM/CMakeLists.txt | 8 + .../Basis/HGRAD_HEX_Cn_FEM/eti/test_01_ETI.in | 5 +- .../Basis/HGRAD_HEX_Cn_FEM/eti/test_02_ETI.in | 35 ++- .../Basis/HGRAD_HEX_Cn_FEM/test_02.hpp | 201 +++++++++------ .../Basis/HGRAD_LINE_C1_FEM/CMakeLists.txt | 80 +++++- .../HGRAD_LINE_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_LINE_C1_FEM/test_02.hpp | 185 +++++++++++++ .../Basis/HGRAD_LINE_C2_FEM/CMakeLists.txt | 80 +++++- .../HGRAD_LINE_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_LINE_C2_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_LINE_Cn_FEM/CMakeLists.txt | 78 +++++- .../HGRAD_LINE_Cn_FEM/eti/test_01_ETI.in | 6 +- .../HGRAD_LINE_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_LINE_Cn_FEM/test_02.hpp | 188 ++++++++++++++ .../Basis/HGRAD_PYR_C1_FEM/CMakeLists.txt | 80 +++++- .../Basis/HGRAD_PYR_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_PYR_C1_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_PYR_I2_FEM/CMakeLists.txt | 82 +++++- .../Basis/HGRAD_PYR_I2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_PYR_I2_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_QUAD_C1_FEM/CMakeLists.txt | 80 +++++- .../HGRAD_QUAD_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_QUAD_C1_FEM/test_02.hpp | 228 ++++++++++++++++ .../Basis/HGRAD_QUAD_C2_FEM/CMakeLists.txt | 86 +++++- .../HGRAD_QUAD_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_QUAD_C2_FEM/test_02.hpp | 228 ++++++++++++++++ .../Basis/HGRAD_QUAD_Cn_FEM/CMakeLists.txt | 8 + 
.../HGRAD_QUAD_Cn_FEM/eti/test_01_ETI.in | 5 +- .../HGRAD_QUAD_Cn_FEM/eti/test_02_ETI.in | 33 ++- .../Basis/HGRAD_QUAD_Cn_FEM/test_02.hpp | 244 ++++++++++++------ .../Basis/HGRAD_TET_C1_FEM/CMakeLists.txt | 80 +++++- .../Basis/HGRAD_TET_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_TET_C1_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_TET_C2_FEM/CMakeLists.txt | 80 +++++- .../Basis/HGRAD_TET_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_TET_C2_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_TET_COMP12_FEM/CMakeLists.txt | 80 +++++- .../HGRAD_TET_COMP12_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_TET_COMP12_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_TET_Cn_FEM/CMakeLists.txt | 8 + .../Basis/HGRAD_TET_Cn_FEM/eti/test_01_ETI.in | 6 +- .../Basis/HGRAD_TET_Cn_FEM/eti/test_02_ETI.in | 33 ++- .../Basis/HGRAD_TET_Cn_FEM/test_02.hpp | 202 +++++++++------ .../Basis/HGRAD_TRI_C1_FEM/CMakeLists.txt | 80 +++++- .../Basis/HGRAD_TRI_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_TRI_C1_FEM/test_02.hpp | 228 ++++++++++++++++ .../Basis/HGRAD_TRI_C2_FEM/CMakeLists.txt | 80 +++++- .../Basis/HGRAD_TRI_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_TRI_C2_FEM/test_02.hpp | 228 ++++++++++++++++ .../Basis/HGRAD_TRI_Cn_FEM/CMakeLists.txt | 8 + .../Basis/HGRAD_TRI_Cn_FEM/eti/test_01_ETI.in | 8 +- .../Basis/HGRAD_TRI_Cn_FEM/eti/test_02_ETI.in | 31 ++- .../Basis/HGRAD_TRI_Cn_FEM/test_02.hpp | 243 +++++++++++------ .../Basis/HGRAD_WEDGE_C1_FEM/CMakeLists.txt | 80 +++++- .../HGRAD_WEDGE_C1_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_WEDGE_C1_FEM/test_02.hpp | 184 +++++++++++++ .../Basis/HGRAD_WEDGE_C2_FEM/CMakeLists.txt | 87 ++++++- .../HGRAD_WEDGE_C2_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HGRAD_WEDGE_C2_FEM/test_02.hpp | 182 +++++++++++++ .../Basis/HVOL_HEX_Cn_FEM/CMakeLists.txt | 79 +++++- .../Basis/HVOL_HEX_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HVOL_HEX_Cn_FEM/test_02.hpp | 144 +++++++++++ 
.../Basis/HVOL_LINE_Cn_FEM/CMakeLists.txt | 79 +++++- .../Basis/HVOL_LINE_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HVOL_LINE_Cn_FEM/test_02.hpp | 144 +++++++++++ .../Basis/HVOL_QUAD_Cn_FEM/CMakeLists.txt | 79 +++++- .../Basis/HVOL_QUAD_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HVOL_QUAD_Cn_FEM/test_02.hpp | 144 +++++++++++ .../Basis/HVOL_TET_Cn_FEM/CMakeLists.txt | 79 +++++- .../Basis/HVOL_TET_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HVOL_TET_Cn_FEM/test_02.hpp | 144 +++++++++++ .../Basis/HVOL_TRI_Cn_FEM/CMakeLists.txt | 79 +++++- .../Basis/HVOL_TRI_Cn_FEM/eti/test_02_ETI.in | 52 ++++ .../Basis/HVOL_TRI_Cn_FEM/test_02.hpp | 145 +++++++++++ .../unit-test/Shared/Polylib/test_01.hpp | 16 +- 226 files changed, 17059 insertions(+), 1394 deletions(-) create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/test_02.hpp create 
mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/eti/test_02_ETI.in create mode 100644 
packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/eti/test_03_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/test_03.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/test_02.hpp create mode 100644 
packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/eti/test_02_ETI.in create mode 100644 
packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/test_02.hpp create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/eti/test_02_ETI.in create mode 100644 packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/test_02.hpp diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp index 74b34efb6681..5779d95741e8 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp @@ -379,6 +379,61 @@ using HostBasisPtr = BasisPtrinputPoints is only used to deduce the type of the points where to evaluate basis functions. + The rank of inputPoints and its size are not relevant, however, + when using DFAD types, inputPoints cannot be empty, + otherwise the size of the scratch space needed won't be deduced correctly. 
+ \param inputPoints [in] - points used only to deduce the point type + \param perTeamSpaceSize [out] - size of the scratch space needed per team + \param perThreadSpaceSize [out] - size of the scratch space needed per thread + */ + virtual + void getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE( true, std::logic_error, + ">>> ERROR (Basis::getValuesScratchSpace): this method is not supported or should be overridden accordingly by derived classes."); + } + + + /** \brief Team-level evaluation of basis functions on a reference cell. + + Returns values of operatorType acting on basis functions for a set of + points in the reference cell for which the basis is defined. + + The interface also allows selecting the basis functions associated with a particular entity. + As an example, if subcellDim==1 (edges) and subcellOrdinal==0, outputValues will contain all the basis functions associated with the first edge. + outputValues will contain all the cell basis functions when the default value (-1) is used for subcellDim and subcellOrdinal. + + \param outputValues [out] - variable rank array with the basis values + \param inputPoints [in] - rank-2 array (P,D) with the evaluation points + \param operatorType [in] - the operator acting on the basis functions + \param teamMember [in] - team member of the Kokkos::TeamPolicy + \param scratchStorage [in] - scratch space to be used by each team + \param subcellDim [in] - the dimension of the subcells, the default value of -1 returns basis functions associated to subcells of all dimensions + \param subcellOrdinal [in] - the ordinal of the subcell, the default value of -1 returns basis functions associated to subcells of all ordinals + + \remark This function is supposed to be called within a TeamPolicy kernel. + The size of the required scratch space is determined by the getScratchSpaceSize function. 
+ */ + KOKKOS_INLINE_FUNCTION + virtual + void getValues( OutputViewType /* outputValues */, + const PointViewType /* inputPoints */, + const EOperator /* operatorType */, + const typename Kokkos::TeamPolicy::member_type& teamMember, + const typename ExecutionSpace::scratch_memory_space &scratchStorage, + const ordinal_type subcellDim=-1, + const ordinal_type subcellOrdinal=-1) const { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE( true, std::logic_error, + ">>> ERROR (Basis::getValues): this method is not supported or should be overridden accordingly by derived classes."); + } + /** \brief Evaluation of a FEM basis on a reference cell. Returns values of operatorType acting on FEM basis functions for a set of diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEM.hpp index 299054557fca..72d0e9112c01 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEM.hpp @@ -185,6 +185,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEMDef.hpp 
index 7eff91667e1b..71ea78656fc1 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_I1_FEMDef.hpp @@ -330,6 +330,57 @@ namespace Intrepid2 { } + template + void + Basis_HCURL_HEX_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_HEX_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_HEX_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_HEX_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_HEX_I1_FEM::Serial::getValues( output, input); + }); + break; + 
default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HCURL_HEX_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEM.hpp index 1af120be9949..64327bb29c08 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEM.hpp @@ -148,20 +148,21 @@ namespace Intrepid2 { class Basis_HCURL_HEX_In_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. 
*/ Basis_HCURL_HEX_In_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -184,6 +185,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEMDef.hpp index 1d18b7887096..182c05d721b0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_HEX_In_FEMDef.hpp @@ -21,19 +21,19 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HCURL_HEX_In_FEM::Serial:: + Basis_HCURL_HEX_In_FEM::Serial:: 
getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinvLine, - const vinvViewType vinvBubble) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinvLine, + const VinvViewType vinvBubble) { const ordinal_type cardLine = vinvLine.extent(0); const ordinal_type cardBubble = vinvBubble.extent(0); @@ -44,22 +44,22 @@ namespace Intrepid2 { const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); const auto input_z = Kokkos::subview(input, Kokkos::ALL(), range_type(2,3)); - const ordinal_type dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data() + cardLine*npts*dim_s; auto ptr2 = work.data() + 2*cardLine*npts*dim_s; auto ptr3 = work.data() + 3*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType outputLine_A(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType outputLine_B(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); - viewType outputBubble(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType outputLine_A(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType outputLine_B(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + ViewType outputBubble(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); // tensor product ordinal_type idx = 0; @@ -142,12 +142,12 @@ namespace Intrepid2 { auto ptr4 = work.data() + 4*cardLine*npts*dim_s; auto ptr5 = work.data() + 5*cardLine*npts*dim_s; - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType 
outputLine_A(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType outputLine_B(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); - viewType outputLine_DA(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts, 1); - viewType outputLine_DB(Kokkos::view_wrap(ptr4, vcprop), cardLine, npts, 1); - viewType outputBubble(Kokkos::view_wrap(ptr5, vcprop), cardBubble, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType outputLine_A(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType outputLine_B(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + ViewType outputLine_DA(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts, 1); + ViewType outputLine_DB(Kokkos::view_wrap(ptr4, vcprop), cardLine, npts, 1); + ViewType outputBubble(Kokkos::view_wrap(ptr5, vcprop), cardBubble, npts); // tensor product ordinal_type idx = 0; @@ -588,6 +588,70 @@ namespace Intrepid2 { this->dofCoeffs_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoeffsHost); Kokkos::deep_copy(this->dofCoeffs_, dofCoeffsHost); } -} + + template + void + Basis_HCURL_HEX_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + ordinal_type scalarWorkViewExtent = (operatorType == OPERATOR_VALUE) ? 
+ 3*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0): + 5*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0); + perThreadSpaceSize = scalarWorkViewExtent*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_HEX_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_HEX_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type scalarSizePerPoint = (operatorType == OPERATOR_VALUE) ? 
+ 3*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0): + 5*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0); + ordinal_type sizePerPoint = scalarSizePerPoint*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_HEX_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_HEX_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HCURL_HEX_In_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEM.hpp index 15f266e2db91..24c4b26bf746 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEM.hpp @@ -144,20 +144,21 @@ namespace Intrepid2 { 
typename pointValueType = double> class Basis_HCURL_QUAD_I1_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. */ Basis_HCURL_QUAD_I1_FEM(); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -178,6 +179,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEMDef.hpp index 548929fb74cc..8380a4665a05 100644 --- 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_I1_FEMDef.hpp @@ -60,7 +60,7 @@ namespace Intrepid2 { default: { INTREPID2_TEST_FOR_ABORT( opType != OPERATOR_VALUE && opType != OPERATOR_CURL, - ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_C1_FEM::Serial::getValues) operator is not supported"); + ">>> ERROR: (Intrepid2::Basis_HCURL_QUAD_I1_FEM::Serial::getValues) operator is not supported"); } } //end switch } @@ -219,7 +219,56 @@ namespace Intrepid2 { } + template + void + Basis_HCURL_QUAD_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } -}// namespace Intrepid2 + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_QUAD_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_QUAD_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_QUAD_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + 
Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_QUAD_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HCURL_QUAD_I1_FEM::getValues), Operator Type not supported."); + } + } + } + +}// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEM.hpp index 077f6de07afb..13d0c227d421 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEM.hpp @@ -136,20 +136,21 @@ namespace Intrepid2 { class Basis_HCURL_QUAD_In_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. 
*/ Basis_HCURL_QUAD_In_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -172,6 +173,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEMDef.hpp index 13a732abb88d..b00248a51fc8 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_QUAD_In_FEMDef.hpp @@ -21,19 +21,19 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HCURL_QUAD_In_FEM::Serial:: + 
Basis_HCURL_QUAD_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinvLine, - const vinvViewType vinvBubble) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinvLine, + const VinvViewType vinvBubble) { const ordinal_type cardLine = vinvLine.extent(0); const ordinal_type cardBubble = vinvBubble.extent(0); @@ -43,19 +43,19 @@ namespace Intrepid2 { const auto input_x = Kokkos::subview(input, Kokkos::ALL(), range_type(0,1)); const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); - const int dim_s = get_dimension_scalar(work); + const int dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType outputBubble(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType outputBubble(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); // tensor product ordinal_type idx = 0; @@ -101,11 +101,11 @@ namespace Intrepid2 { case OPERATOR_CURL: { ordinal_type idx = 0; { // x - component - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); // x bubble value - viewType output_x(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType output_x(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); // y line grad - viewType 
output_y(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); + ViewType output_y(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, workLine, vinvBubble); @@ -120,11 +120,11 @@ namespace Intrepid2 { output.access(idx,k) = -output_x.access(i,k)*output_y.access(j,k,0); } { // y - component - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); // x line grad - viewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); + ViewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); // y bubble value - viewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, workLine, vinvBubble); @@ -386,6 +386,63 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoeffs_, dofCoeffsHost); } -} + template + void + Basis_HCURL_QUAD_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = (2*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_QUAD_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_QUAD_In_FEM::getValues), The capability of selecting subsets of basis 
functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = (2*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_QUAD_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_QUAD_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HCURL_QUAD_In_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEM.hpp index 6d90318a4961..d293da0e38c0 100644 --- 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEM.hpp @@ -184,6 +184,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEMDef.hpp index 9c3d2b2d1c23..4d38583f8e49 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_I1_FEMDef.hpp @@ -255,6 +255,57 @@ namespace Intrepid2 { } + template + void + Basis_HCURL_TET_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_TET_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + 
INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_TET_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_TET_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_TET_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HCURL_TET_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEM.hpp index cae49e5b09a5..ed253d57ec13 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEM.hpp @@ -217,9 +217,26 @@ class Basis_HCURL_TET_In_FEM operatorType); } - virtual - void - getDofCoords( ScalarViewType dofCoords ) const override { + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + 
KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual + void + getDofCoords( ScalarViewType dofCoords ) const override { #ifdef HAVE_INTREPID2_DEBUG // Verify rank of output array. INTREPID2_TEST_FOR_EXCEPTION( rank(dofCoords) != 2, std::invalid_argument, diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEMDef.hpp index 8bb82254291f..56149a4a1820 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TET_In_FEMDef.hpp @@ -26,18 +26,18 @@ namespace Intrepid2 { namespace Impl { -template +template template +typename InputViewType, +typename WorkViewType, +typename VinvViewType> KOKKOS_INLINE_FUNCTION void -Basis_HCURL_TET_In_FEM::Serial:: +Basis_HCURL_TET_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType coeffs ) { + const InputViewType input, + WorkViewType work, + const VinvViewType coeffs ) { constexpr ordinal_type spaceDim = 3; const ordinal_type @@ -54,17 +54,17 @@ getValues( OutputViewType output, } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), 
card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); @@ -282,7 +282,7 @@ Basis_HCURL_TET_In_FEM( const ordinal_type order, #ifdef HAVE_INTREPID2_DEBUG ordinal_type num_nonzero_sv = 0; for (int i=0;i tolerence()); + num_nonzero_sv += (S(i,0) > 10*tolerence()); INTREPID2_TEST_FOR_EXCEPTION( num_nonzero_sv != card, std::invalid_argument, ">>> ERROR: (Intrepid2::Basis_HCURL_TET_In_FEM( order, pointType), Matrix V1 should have rank equal to the cardinality of HCURL space"); @@ -562,5 +562,64 @@ Basis_HCURL_TET_In_FEM( const ordinal_type order, posDfOrd); } } + +template +void +Basis_HCURL_TET_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + ordinal_type scalarWorkViewExtent = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 7*this->basisCardinality_; + perThreadSpaceSize = scalarWorkViewExtent*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); +} + +template +KOKKOS_INLINE_FUNCTION +void +Basis_HCURL_TET_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_TET_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type scalarSizePerPoint = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 7*this->basisCardinality_; + ordinal_type sizePerPoint = scalarSizePerPoint*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_TET_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_TET_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HCURL_TET_In_FEM): getValues not implemented for this operator"); + } + } +} } // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEM.hpp index 816b999560a6..109c96988649 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEM.hpp @@ -187,6 +187,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, 
+ const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEMDef.hpp index 85e639ea8f10..813b764608db 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_I1_FEMDef.hpp @@ -208,7 +208,56 @@ namespace Intrepid2 { } + template + void + Basis_HCURL_TRI_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_TRI_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_TRI_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + 
switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_TRI_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_TRI_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HCURL_TRI_!1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif - diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEM.hpp index 3c34d125847a..a030f292fb50 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEM.hpp @@ -209,9 +209,26 @@ class Basis_HCURL_TRI_In_FEM operatorType); } - virtual - void - getDofCoords( ScalarViewType dofCoords ) const override { + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const 
ordinal_type subcellOrdinal = -1) const override; + + virtual + void + getDofCoords( ScalarViewType dofCoords ) const override { #ifdef HAVE_INTREPID2_DEBUG // Verify rank of output array. INTREPID2_TEST_FOR_EXCEPTION( rank(dofCoords) != 2, std::invalid_argument, diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEMDef.hpp index 7d10682a5e45..6cb65ab386de 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_TRI_In_FEMDef.hpp @@ -25,18 +25,18 @@ namespace Intrepid2 { namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HCURL_TRI_In_FEM::Serial:: + Basis_HCURL_TRI_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType coeffs ) { + const InputViewType input, + WorkViewType work, + const VinvViewType coeffs ) { constexpr ordinal_type spaceDim = 2; const ordinal_type @@ -53,17 +53,16 @@ namespace Intrepid2 { } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts), dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); @@ -452,5 +451,66 @@ namespace Intrepid2 { posDfOrd); } } + + template + void + 
Basis_HCURL_TRI_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + ordinal_type scalarWorkViewExtent = (operatorType == OPERATOR_VALUE) ? this->basisCardinality_ : 5*this->basisCardinality_; + perThreadSpaceSize = scalarWorkViewExtent*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_TRI_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_TRI_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type scalarSizePerPoint = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 5*this->basisCardinality_; + ordinal_type sizePerPoint = scalarSizePerPoint*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_TRI_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HCURL_TRI_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HCURL_TRI_In_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEM.hpp index c7587cf3eec1..d2831d0ac47a 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEM.hpp @@ -185,6 +185,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& 
perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEMDef.hpp index 59ad4da436e8..754355ffbd7d 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HCURL_WEDGE_I1_FEMDef.hpp @@ -276,5 +276,57 @@ namespace Intrepid2 { } + template + void + Basis_HCURL_WEDGE_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HCURL_WEDGE_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HCURL_WEDGE_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int 
numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_WEDGE_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HCURL_WEDGE_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HCURL_WEDGE_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEM.hpp index 1de1d7c654c7..66ab525b3aec 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEM.hpp @@ -190,6 +190,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const 
override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEMDef.hpp index 79e9aaef60f8..b7e865178e64 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_I1_FEMDef.hpp @@ -236,5 +236,57 @@ namespace Intrepid2 { } + template + void + Basis_HDIV_HEX_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_HEX_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_HEX_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_HEX_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& 
pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_HEX_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HDIV_HEX_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEM.hpp index 4ed98a89967f..f563bd998237 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEM.hpp @@ -138,20 +138,21 @@ namespace Intrepid2 { class Basis_HDIV_HEX_In_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. 
*/ Basis_HDIV_HEX_In_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -174,6 +175,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { @@ -254,8 +272,6 @@ namespace Intrepid2 { }// namespace Intrepid2 - - #include "Intrepid2_HDIV_HEX_In_FEMDef.hpp" #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEMDef.hpp index 0bae2c8b1b3d..0d5d25113bdb 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_HEX_In_FEMDef.hpp @@ -21,19 +21,19 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + 
typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HDIV_HEX_In_FEM::Serial:: + Basis_HDIV_HEX_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinvLine, - const vinvViewType vinvBubble) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinvLine, + const VinvViewType vinvBubble) { const ordinal_type cardLine = vinvLine.extent(0); const ordinal_type cardBubble = vinvBubble.extent(0); @@ -44,21 +44,21 @@ namespace Intrepid2 { const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); const auto input_z = Kokkos::subview(input, Kokkos::ALL(), range_type(2,3)); - const ordinal_type dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; auto ptr3 = work.data()+(2*cardLine+cardBubble)*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType outputBubble_A(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); - viewType outputBubble_B(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType outputBubble_A(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType outputBubble_B(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); // tensor product ordinal_type idx = 0; @@ -138,13 +138,13 @@ namespace Intrepid2 { break; } case OPERATOR_DIV: { - viewType 
workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); // A line value - viewType outputBubble_A(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType outputBubble_A(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); // B line value - viewType outputBubble_B(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); + ViewType outputBubble_B(Kokkos::view_wrap(ptr3, vcprop), cardBubble, npts); // Line grad - viewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); + ViewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); // tensor product ordinal_type idx = 0; @@ -508,6 +508,64 @@ namespace Intrepid2 { this->dofCoeffs_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoeffsHost); Kokkos::deep_copy(this->dofCoeffs_, dofCoeffsHost); } -} + + template + void + Basis_HDIV_HEX_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = (2*this->vinvLine_.extent(0)+2*this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_HEX_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_HEX_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + 
using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = (2*this->vinvLine_.extent(0)+2*this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_HEX_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_HEX_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HDIV_HEX_In_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEM.hpp index fab13618142c..66c5843d4da0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEM.hpp @@ 
-145,20 +145,21 @@ namespace Intrepid2 { typename pointValueType = double> class Basis_HDIV_QUAD_I1_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. */ Basis_HDIV_QUAD_I1_FEM(); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -179,6 +180,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEMDef.hpp index 
1924df01978b..22eeef61929a 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_I1_FEMDef.hpp @@ -213,10 +213,59 @@ namespace Intrepid2 { this->dofCoeffs_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoeffs); Kokkos::deep_copy(this->dofCoeffs_, dofCoeffs); - } + template + void + Basis_HDIV_QUAD_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_QUAD_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_QUAD_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_QUAD_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = 
Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_QUAD_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HDIV_QUAD_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEM.hpp index 3db4472991f2..60da55f64220 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEM.hpp @@ -135,20 +135,21 @@ namespace Intrepid2 { class Basis_HDIV_QUAD_In_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; /** \brief Constructor. 
*/ Basis_HDIV_QUAD_In_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -170,6 +171,24 @@ namespace Intrepid2 { this->vinvBubble_, operatorType ); } + + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEMDef.hpp index f6958e8152c2..ee5bdc9458c4 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_QUAD_In_FEMDef.hpp @@ -21,19 +21,19 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HDIV_QUAD_In_FEM::Serial:: + 
Basis_HDIV_QUAD_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinvLine, - const vinvViewType vinvBubble) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinvLine, + const VinvViewType vinvBubble) { const ordinal_type cardLine = vinvLine.extent(0); const ordinal_type cardBubble = vinvBubble.extent(0); @@ -43,20 +43,19 @@ namespace Intrepid2 { const auto input_x = Kokkos::subview(input, Kokkos::ALL(), range_type(0,1)); const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); - const int dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; - - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType outputBubble(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType outputLine(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType outputBubble(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); // tensor product ordinal_type idx = 0; @@ -100,11 +99,11 @@ namespace Intrepid2 { case OPERATOR_DIV: { ordinal_type idx = 0; { // x - component - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); // x bubble value - viewType output_x(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType output_x(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); // y line grad - 
viewType output_y(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); + ViewType output_y(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, workLine, vinvBubble); @@ -119,11 +118,11 @@ namespace Intrepid2 { output.access(idx,k) = output_x.access(i,k)*output_y.access(j,k,0); } { // y - component - viewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType workLine(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); // x line grad - viewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); + ViewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts,1); // y bubble value - viewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); + ViewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardBubble, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, workLine, vinvBubble); @@ -381,6 +380,63 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoeffs_, dofCoeffsHost); } -} + template + void + Basis_HDIV_QUAD_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = (2*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_QUAD_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_QUAD_In_FEM::getValues), The capability of selecting subsets of basis 
functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = (2*this->vinvLine_.extent(0)+this->vinvBubble_.extent(0))*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_QUAD_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_QUAD_In_FEM::Serial::getValues( output, input, work, this->vinvLine_, this->vinvBubble_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HDIV_QUAD_In_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEM.hpp index a5b72c79d540..40b58117d658 100644 --- 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEM.hpp @@ -144,19 +144,21 @@ namespace Intrepid2 { typename pointValueType = double> class Basis_HDIV_TET_I1_FEM: public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. */ Basis_HDIV_TET_I1_FEM(); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -177,6 +179,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEMDef.hpp 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEMDef.hpp index febd2f87333c..7aac987e9439 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_I1_FEMDef.hpp @@ -230,6 +230,58 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoeffs_, dofCoeffs); } + template + void + Basis_HDIV_TET_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_TET_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_TET_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( 
inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_TET_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HDIV_TET_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEM.hpp index c2f17ac83aef..9224322f9539 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEM.hpp @@ -165,131 +165,144 @@ template class Basis_HDIV_TET_In_FEM : public Basis { - public: - typedef typename Basis::OrdinalTypeArray1DHost OrdinalTypeArray1DHost; - typedef typename Basis::OrdinalTypeArray2DHost OrdinalTypeArray2DHost; - typedef typename Basis::OrdinalTypeArray3DHost OrdinalTypeArray3DHost; - - /** \brief Constructor. - */ - Basis_HDIV_TET_In_FEM(const ordinal_type order, - const EPointType pointType = POINTTYPE_EQUISPACED); - - - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - typedef typename Basis::scalarType scalarType; - - using Basis::getValues; - - virtual - void - getValues( /* */ OutputViewType outputValues, - const PointViewType inputPoints, - const EOperator operatorType = OPERATOR_VALUE) const override { + public: + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; + + /** \brief Constructor. 
+ */ + Basis_HDIV_TET_In_FEM(const ordinal_type order, + const EPointType pointType = POINTTYPE_EQUISPACED); + + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; + using scalarType = typename BasisBase::scalarType; + using BasisBase::getValues; + + virtual + void + getValues( /* */ OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override { #ifdef HAVE_INTREPID2_DEBUG - Intrepid2::getValues_HDIV_Args(outputValues, + Intrepid2::getValues_HDIV_Args(outputValues, inputPoints, operatorType, this->getBaseCellTopology(), this->getCardinality() ); #endif -constexpr ordinal_type numPtsPerEval = Parameters::MaxNumPtsPerBasisEval; -Impl::Basis_HDIV_TET_In_FEM:: -getValues( outputValues, - inputPoints, - this->coeffs_, - operatorType); - } - - virtual - void - getDofCoords( ScalarViewType dofCoords ) const override { + constexpr ordinal_type numPtsPerEval = Parameters::MaxNumPtsPerBasisEval; + Impl::Basis_HDIV_TET_In_FEM:: + getValues( outputValues, + inputPoints, + this->coeffs_, + operatorType); + } + + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual + void + getDofCoords( ScalarViewType dofCoords ) const override { #ifdef HAVE_INTREPID2_DEBUG - // Verify rank of output array. 
- INTREPID2_TEST_FOR_EXCEPTION( dofCoords.rank() != 2, std::invalid_argument, - ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) rank = 2 required for dofCoords array"); - // Verify 0th dimension of output array. - INTREPID2_TEST_FOR_EXCEPTION( static_cast(dofCoords.extent(0)) != this->getCardinality(), std::invalid_argument, - ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) mismatch in number of dof and 0th dimension of dofCoords array"); - // Verify 1st dimension of output array. - INTREPID2_TEST_FOR_EXCEPTION( dofCoords.extent(1) != this->getBaseCellTopology().getDimension(), std::invalid_argument, - ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) incorrect reference cell (1st) dimension in dofCoords array"); + // Verify rank of output array. + INTREPID2_TEST_FOR_EXCEPTION( dofCoords.rank() != 2, std::invalid_argument, + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) rank = 2 required for dofCoords array"); + // Verify 0th dimension of output array. + INTREPID2_TEST_FOR_EXCEPTION( static_cast(dofCoords.extent(0)) != this->getCardinality(), std::invalid_argument, + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) mismatch in number of dof and 0th dimension of dofCoords array"); + // Verify 1st dimension of output array. + INTREPID2_TEST_FOR_EXCEPTION( dofCoords.extent(1) != this->getBaseCellTopology().getDimension(), std::invalid_argument, + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoords) incorrect reference cell (1st) dimension in dofCoords array"); #endif - Kokkos::deep_copy(dofCoords, this->dofCoords_); - } + Kokkos::deep_copy(dofCoords, this->dofCoords_); + } - virtual - void - getDofCoeffs( ScalarViewType dofCoeffs ) const override { -#ifdef HAVE_INTREPID2_DEBUG - // Verify rank of output array. 
- INTREPID2_TEST_FOR_EXCEPTION( dofCoeffs.rank() != 2, std::invalid_argument, - ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoeffs) rank = 2 required for dofCoeffs array"); - // Verify 0th dimension of output array. - INTREPID2_TEST_FOR_EXCEPTION( static_cast(dofCoeffs.extent(0)) != this->getCardinality(), std::invalid_argument, + virtual + void + getDofCoeffs( ScalarViewType dofCoeffs ) const override { + #ifdef HAVE_INTREPID2_DEBUG + // Verify rank of output array. + INTREPID2_TEST_FOR_EXCEPTION( dofCoeffs.rank() != 2, std::invalid_argument, + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoeffs) rank = 2 required for dofCoeffs array"); + // Verify 0th dimension of output array. + INTREPID2_TEST_FOR_EXCEPTION( static_cast(dofCoeffs.extent(0)) != this->getCardinality(), std::invalid_argument, ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoeffs) mismatch in number of dof and 0th dimension of dofCoeffs array"); - // Verify 1st dimension of output array. - INTREPID2_TEST_FOR_EXCEPTION( dofCoeffs.extent(1) != this->getBaseCellTopology().getDimension(), std::invalid_argument, + // Verify 1st dimension of output array. + INTREPID2_TEST_FOR_EXCEPTION( dofCoeffs.extent(1) != this->getBaseCellTopology().getDimension(), std::invalid_argument, ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getDofCoeffs) incorrect reference cell (1st) dimension in dofCoeffs array"); #endif - Kokkos::deep_copy(dofCoeffs, this->dofCoeffs_); - } - - void - getExpansionCoeffs( ScalarViewType coeffs ) const { - // has to be same rank and dimensions - Kokkos::deep_copy(coeffs, this->coeffs_); - } - - virtual - const char* - getName() const override { - return "Intrepid2_HDIV_TET_In_FEM"; - } - - virtual - bool - requireOrientation() const override { - return true; - } - - /** \brief returns the basis associated to a subCell. - - The bases of the subCell are the restriction to the subCell of the bases of the parent cell, - projected along normal to the subCell. 
- - \param [in] subCellDim - dimension of subCell - \param [in] subCellOrd - position of the subCell among of the subCells having the same dimension - \return pointer to the subCell basis of dimension subCellDim and position subCellOrd - */ - BasisPtr - getSubCellRefBasis(const ordinal_type subCellDim, const ordinal_type subCellOrd) const override{ + Kokkos::deep_copy(dofCoeffs, this->dofCoeffs_); + } - if(subCellDim == 2) { - return Teuchos::rcp(new - Basis_HVOL_TRI_Cn_FEM - (this->basisDegree_-1, pointType_)); + void + getExpansionCoeffs( ScalarViewType coeffs ) const { + // has to be same rank and dimensions + Kokkos::deep_copy(coeffs, this->coeffs_); } - INTREPID2_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Input parameters out of bounds"); - } - BasisPtr - getHostBasis() const override{ - return Teuchos::rcp(new Basis_HDIV_TET_In_FEM(this->basisDegree_, pointType_)); - } - private: + virtual + const char* + getName() const override { + return "Intrepid2_HDIV_TET_In_FEM"; + } - /** \brief expansion coefficients of the nodal basis in terms of the - orthgonal one */ - Kokkos::DynRankView coeffs_; + virtual + bool + requireOrientation() const override { + return true; + } - /** \brief type of lattice used for creating the DoF coordinates */ - EPointType pointType_; + /** \brief returns the basis associated to a subCell. -}; + The bases of the subCell are the restriction to the subCell of the bases of the parent cell, + projected along normal to the subCell. 
+ + \param [in] subCellDim - dimension of subCell + \param [in] subCellOrd - position of the subCell among of the subCells having the same dimension + \return pointer to the subCell basis of dimension subCellDim and position subCellOrd + */ + BasisPtr + getSubCellRefBasis(const ordinal_type subCellDim, const ordinal_type subCellOrd) const override{ + + if(subCellDim == 2) { + return Teuchos::rcp(new + Basis_HVOL_TRI_Cn_FEM + (this->basisDegree_-1, pointType_)); + } + INTREPID2_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Input parameters out of bounds"); + } + + BasisPtr + getHostBasis() const override{ + return Teuchos::rcp(new Basis_HDIV_TET_In_FEM(this->basisDegree_, pointType_)); + } + private: + + /** \brief expansion coefficients of the nodal basis in terms of the orthgonal one */ + Kokkos::DynRankView coeffs_; + + /** \brief type of lattice used for creating the DoF coordinates */ + EPointType pointType_; + }; }// namespace Intrepid2 diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEMDef.hpp index e8c97199198f..96e0e7cf2267 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TET_In_FEMDef.hpp @@ -25,18 +25,18 @@ namespace Intrepid2 { namespace Impl { -template +template template +typename InputViewType, +typename WorkViewType, +typename VinvViewType> KOKKOS_INLINE_FUNCTION void -Basis_HDIV_TET_In_FEM::Serial:: +Basis_HDIV_TET_In_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType coeffs ) { + const InputViewType input, + WorkViewType work, + const VinvViewType coeffs ) { constexpr ordinal_type spaceDim = 3; const ordinal_type @@ -53,17 +53,17 @@ getValues( OutputViewType output, } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = 
Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); @@ -104,10 +104,10 @@ typename inputPointValueType, class ...inputPointProperties, typename vinvValueType, class ...vinvProperties> void Basis_HDIV_TET_In_FEM:: -getValues( /* */ Kokkos::DynRankView outputValues, - const Kokkos::DynRankView inputPoints, - const Kokkos::DynRankView coeffs, - const EOperator operatorType) { +getValues( Kokkos::DynRankView outputValues, + const Kokkos::DynRankView inputPoints, + const Kokkos::DynRankView coeffs, + const EOperator operatorType) { typedef Kokkos::DynRankView outputValueViewType; typedef Kokkos::DynRankView inputPointViewType; typedef Kokkos::DynRankView vinvViewType; @@ -450,5 +450,64 @@ Basis_HDIV_TET_In_FEM( const ordinal_type order, posDfOrd); } } + +template +void +Basis_HDIV_TET_In_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + ordinal_type scalarWorkViewExtent = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 7*this->basisCardinality_; + perThreadSpaceSize = scalarWorkViewExtent*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); +} + +template +KOKKOS_INLINE_FUNCTION +void +Basis_HDIV_TET_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_TET_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type scalarSizePerPoint = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 7*this->basisCardinality_; + ordinal_type sizePerPoint = scalarSizePerPoint*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_TET_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_TET_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HDIV_TET_In_FEM): getValues not implemented for this operator"); + } + } +} } // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEM.hpp index 6f6596778567..8315ac027c92 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEM.hpp @@ -142,19 +142,21 @@ namespace Intrepid2 { typename pointValueType = double> class Basis_HDIV_TRI_I1_FEM: public Basis { public: - using OrdinalTypeArray1DHost = typename 
Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. */ Basis_HDIV_TRI_I1_FEM(); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -175,6 +177,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEMDef.hpp index a7c13864b3a2..ef00a009fdc7 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_I1_FEMDef.hpp @@ -72,6 +72,7 @@ namespace Intrepid2 { typedef Kokkos::DynRankView inputPointViewType; 
typedef typename ExecSpace::ExecSpaceType ExecSpaceType; + // Number of evaluation points = dim 0 of inputPoints const auto loopSize = inputPoints.extent(0); Kokkos::RangePolicy > policy(0, loopSize); @@ -215,5 +216,56 @@ namespace Intrepid2 { } + template + void + Basis_HDIV_TRI_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_TRI_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_TRI_I1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_TRI_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_TRI_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + 
INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HDIV_TRI_!1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEM.hpp index 71fd4f8ae99d..1ca324c938ef 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEM.hpp @@ -169,31 +169,30 @@ template class Basis_HDIV_TRI_In_FEM : public Basis { - public: - typedef typename Basis::OrdinalTypeArray1DHost OrdinalTypeArray1DHost; - typedef typename Basis::OrdinalTypeArray2DHost OrdinalTypeArray2DHost; - typedef typename Basis::OrdinalTypeArray3DHost OrdinalTypeArray3DHost; - - /** \brief Constructor. - */ - Basis_HDIV_TRI_In_FEM(const ordinal_type order, + public: + using BasisBase = Basis; + using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; + using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; + using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; + + /** \brief Constructor. 
+ */ + Basis_HDIV_TRI_In_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using HostBasis = Basis_HDIV_TRI_In_FEM; + using HostBasis = Basis_HDIV_TRI_In_FEM; - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - typedef typename Basis::scalarType scalarType; - - using Basis::getValues; + using OutputViewType = typename BasisBase::OutputViewType; + using PointViewType = typename BasisBase::PointViewType; + using ScalarViewType = typename BasisBase::ScalarViewType; + using scalarType = typename BasisBase::scalarType; + using BasisBase::getValues; virtual void - getValues( /* */ OutputViewType outputValues, - const PointViewType inputPoints, - const EOperator operatorType = OPERATOR_VALUE) const override { + getValues( OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override { #ifdef HAVE_INTREPID2_DEBUG Intrepid2::getValues_HDIV_Args(outputValues, inputPoints, @@ -209,9 +208,26 @@ class Basis_HDIV_TRI_In_FEM operatorType); } - virtual - void - getDofCoords( ScalarViewType dofCoords ) const override { + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual + void + getDofCoords( ScalarViewType dofCoords ) const override { #ifdef HAVE_INTREPID2_DEBUG // Verify rank of output array. 
INTREPID2_TEST_FOR_EXCEPTION( dofCoords.rank() != 2, std::invalid_argument, diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEMDef.hpp index e412989caa56..89c86f5274e9 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_TRI_In_FEMDef.hpp @@ -24,18 +24,18 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { -template +template template +typename InputViewType, +typename WorkViewType, +typename VinvViewType> KOKKOS_INLINE_FUNCTION void -Basis_HDIV_TRI_In_FEM::Serial:: -getValues( /* */ OutputViewType output, - const inputViewType input, - /* */ workViewType work, - const vinvViewType coeffs ) { +Basis_HDIV_TRI_In_FEM::Serial:: +getValues( OutputViewType output, + const InputViewType input, + WorkViewType work, + const VinvViewType coeffs ) { constexpr ordinal_type spaceDim = 2; const ordinal_type @@ -52,17 +52,17 @@ getValues( /* */ OutputViewType output, } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); @@ -447,5 +447,65 @@ Basis_HDIV_TRI_In_FEM( const ordinal_type order, posDfOrd); } } + + template + void + Basis_HDIV_TRI_In_FEM::getScratchSpaceSize( + 
ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + ordinal_type scalarWorkViewExtent = (operatorType == OPERATOR_VALUE) ? this->basisCardinality_ : 5*this->basisCardinality_; + perThreadSpaceSize = scalarWorkViewExtent*get_dimension_scalar(inputPoints)*sizeof(scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_TRI_In_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_TRI_In_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using WorkViewType = Kokkos::DynRankView< scalarType, typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type scalarSizePerPoint = (operatorType == OPERATOR_VALUE) ? 
this->basisCardinality_ : 5*this->basisCardinality_; + ordinal_type sizePerPoint = scalarSizePerPoint*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_TRI_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HDIV_TRI_In_FEM::Serial::getValues( output, input, work, this->coeffs_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HDIV_TRI_In_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEM.hpp index d30c7d1ee4a3..56a257b9ce54 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEM.hpp @@ -159,6 +159,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& 
perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEMDef.hpp index 79dabbc6bfa6..ceefba8ac59e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HDIV_WEDGE_I1_FEMDef.hpp @@ -183,7 +183,60 @@ namespace Intrepid2 { this->dofCoeffs_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoeffs); Kokkos::deep_copy(this->dofCoeffs_, dofCoeffs); + + } + + template + void + Basis_HDIV_WEDGE_I1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HDIV_WEDGE_I1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HDIV_WEDGE_I1_FEM::getValues), The capability of selecting 
subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_WEDGE_I1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_DIV: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HDIV_WEDGE_I1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HDIV_WEDGE_I1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEM.hpp index 2ea99d81a411..59dfdfff0451 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEM.hpp @@ -177,6 +177,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename 
DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEMDef.hpp index 96a5fcf39429..3c2813faa81e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C1_FEMDef.hpp @@ -413,6 +413,55 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_HEX_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_HEX_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_HEX_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, 
Kokkos::ALL() ); + Impl::Basis_HGRAD_HEX_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_HEX_C1_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEM.hpp index 9823b1fe23bd..1af419c4300e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEM.hpp @@ -245,6 +245,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEMDef.hpp index 22e1a54b74b4..c5a5ec102087 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_C2_FEMDef.hpp 
@@ -1626,8 +1626,57 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoords); Kokkos::deep_copy(this->dofCoords_, dofCoords); + } + template + void + Basis_HGRAD_HEX_DEG2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_HEX_DEG2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_HEX_DEG2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialValue = typename Impl::Basis_HGRAD_HEX_DEG2_FEM::template Serial; + SerialValue::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialGrad = typename Impl::Basis_HGRAD_HEX_DEG2_FEM::template Serial; + 
SerialGrad::getValues( output, input); + }); + break; + default: {} + } + } }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEM.hpp index 4bcefb52d286..987206b0a1e0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEM.hpp @@ -184,6 +184,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEMDef.hpp index ac50cde72fb3..36139dfb95f4 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_HEX_Cn_FEMDef.hpp @@ -44,14 +44,14 @@ namespace Intrepid2 { const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); const auto input_z = Kokkos::subview(input, Kokkos::ALL(), range_type(2,3)); - const ordinal_type dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; auto ptr3 = 
work.data()+3*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView viewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); switch (opType) { case OPERATOR_VALUE: { @@ -382,7 +382,64 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoordsHost); Kokkos::deep_copy(this->dofCoords_, dofCoordsHost); } - + + template + void + Basis_HGRAD_HEX_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + (void) operatorType; //avoid warning for unused variable + perTeamSpaceSize = 0; + perThreadSpaceSize = 4*this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_HEX_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_HEX_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = 4*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case 
OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_HEX_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_HEX_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HGRAD_TET_Cn_FEM): getValues not implemented for this operator"); + } + } + } }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEM.hpp index c07fdfd71f04..71ee77e7d55a 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEM.hpp @@ -161,6 +161,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename 
Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEMDef.hpp index 4572b5b5c899..05291e830a3d 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C1_FEMDef.hpp @@ -177,6 +177,55 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } -} + template + void + Basis_HGRAD_LINE_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_LINE_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_LINE_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, 
Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_LINE_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_LINE_C1_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + +}// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEM.hpp index 595cc815e9be..4e141fc5ad41 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEM.hpp @@ -161,6 +161,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEMDef.hpp index c08bd3692a27..cb3157028b0e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEMDef.hpp 
+++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_C2_FEMDef.hpp @@ -173,6 +173,55 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } -} + template + void + Basis_HGRAD_LINE_C2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_LINE_C2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_LINE_C2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_LINE_C2_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_LINE_C2_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + +}// namespace Intrepid2 #endif diff --git 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEM.hpp index c5e0e2308545..ac2bfdbcae0b 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEM.hpp @@ -192,6 +192,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEMDef.hpp index fc116eb32ebe..6140c13821cb 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_LINE_Cn_FEMDef.hpp @@ -23,16 +23,16 @@ namespace Intrepid2 { template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, const ordinal_type operatorDn ) { ordinal_type opDn = operatorDn; @@ -42,12 +42,12 @@ namespace Intrepid2 { const ordinal_type 
order = card - 1; const double alpha = 0.0, beta = 0.0; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); switch (opType) { case OPERATOR_VALUE: { - viewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts); + ViewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts); Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI:: Serial::getValues(phis, input, order, alpha, beta); @@ -75,7 +75,7 @@ namespace Intrepid2 { case OPERATOR_Dn: { // dkcard is always 1 for 1D element const ordinal_type dkcard = 1; - viewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts, dkcard); + ViewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts, dkcard); Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI:: Serial::getValues(phis, input, order, alpha, beta, opDn); @@ -326,22 +326,64 @@ namespace Intrepid2 { posDfOrd); } } + + template + void + Basis_HGRAD_LINE_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_LINE_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_LINE_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = 
inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_LINE_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_LINE_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HGRAD_LINE_Cn_FEM): getValues not implemented for this operator"); + } + } + } }// namespace Intrepid2 #endif - - - - - - - - - - - - - - - diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEM.hpp index 36463fc8baec..7f00c2a90a2d 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEM.hpp +++ 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEM.hpp @@ -163,6 +163,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEMDef.hpp index e25212499fbd..078ad0e97178 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_C1_FEMDef.hpp @@ -256,6 +256,54 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } -} + template + void + Basis_HGRAD_PYR_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_PYR_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim 
<= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_PYR_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_PYR_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_PYR_C1_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + +}// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEM.hpp index aad5bae97d00..81a09cc56c91 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEM.hpp @@ -180,6 +180,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename 
DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEMDef.hpp index 3247c5aaa0b7..a6ce124d1c2b 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_PYR_I2_FEMDef.hpp @@ -371,6 +371,54 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } -} + template + void + Basis_HGRAD_PYR_I2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_PYR_I2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_PYR_I2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, 
Kokkos::ALL() ); + Impl::Basis_HGRAD_PYR_I2_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_PYR_I2_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + +}// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEM.hpp index 17af8c1c8685..e6b9d7b10c44 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEM.hpp @@ -167,6 +167,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEMDef.hpp index 39504493f063..246a92ac44d5 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEMDef.hpp +++ 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C1_FEMDef.hpp @@ -245,5 +245,63 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_QUAD_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_QUAD_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_QUAD_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_QUAD_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange 
(team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_QUAD_C1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_C1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEM.hpp index d82127eb1ead..038214825966 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEM.hpp @@ -194,6 +194,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEMDef.hpp index 4ec411a0ffc1..36ae4378091c 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_C2_FEMDef.hpp @@ -557,5 +557,66 @@ namespace 
Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_QUAD_DEG2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_QUAD_DEG2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_DEG2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialValue = typename Impl::Basis_HGRAD_QUAD_DEG2_FEM::template Serial; + SerialValue::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialGrad = typename Impl::Basis_HGRAD_QUAD_DEG2_FEM::template Serial; + SerialGrad::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange 
(team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialCurl = typename Impl::Basis_HGRAD_QUAD_DEG2_FEM::template Serial; + SerialCurl::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_DEG2_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEM.hpp index d07b46cafb28..00a206c12ec8 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEM.hpp @@ -181,6 +181,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEMDef.hpp index 582b465a6c57..9e232352285e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEMDef.hpp +++ 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_QUAD_Cn_FEMDef.hpp @@ -21,18 +21,18 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HGRAD_QUAD_Cn_FEM::Serial:: + Basis_HGRAD_QUAD_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, const ordinal_type operatorDn ) { ordinal_type opDn = operatorDn; @@ -43,19 +43,19 @@ namespace Intrepid2 { const auto input_x = Kokkos::subview(input, Kokkos::ALL(), range_type(0,1)); const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); - const int dim_s = get_dimension_scalar(work); + const int dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + ViewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); @@ -73,29 +73,29 @@ namespace Intrepid2 { } case OPERATOR_CURL: { for (auto l=0;l<2;++l) { - viewType 
work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType output_x, output_y; + ViewType output_x, output_y; - typename workViewType::value_type s = 0.0; + typename WorkViewType::value_type s = 0.0; if (l) { // l = 1 - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv, 1); - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv); s = -1.0; } else { // l = 0 - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv, 1); @@ -122,33 +122,33 @@ namespace Intrepid2 { case OPERATOR_D8: case OPERATOR_D9: case OPERATOR_D10: - opDn = getOperatorOrder(opType); + opDn = getOperatorOrder(OpType); case OPERATOR_Dn: { const auto dkcard = opDn + 1; for (auto l=0;l:: getValues(output_x, input_x, work_line, vinv, mult_x); } else { - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); } if (mult_y) { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); 
Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv, mult_y); } else { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HGRAD_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv); } @@ -357,7 +357,72 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoordsHost); Kokkos::deep_copy(this->dofCoords_, dofCoordsHost); } - -}// namespace Intrepid2 + + template + void + Basis_HGRAD_QUAD_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 3*this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_QUAD_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_QUAD_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = 3*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + 
switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_QUAD_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_QUAD_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_QUAD_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HGRAD_QUAD_Cn_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEM.hpp index a2bd5cce665e..d8ed43bad7c7 100644 --- 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEM.hpp @@ -164,6 +164,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEMDef.hpp index 65d632b1b578..c8e0cc996c65 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C1_FEMDef.hpp @@ -202,5 +202,54 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_TET_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TET_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const 
ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TET_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_C1_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEM.hpp index 3b544a29b8b9..703eef86b224 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEM.hpp @@ -183,6 +183,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename 
Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEMDef.hpp index 36aa3d7df7b8..7f4f39634cd0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_C2_FEMDef.hpp @@ -323,5 +323,54 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_TET_C2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TET_C2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TET_C2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() 
); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_C2_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_C2_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEM.hpp index 2bd2814a2d6e..6cb9802be376 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEM.hpp @@ -203,12 +203,23 @@ namespace Intrepid2 { operatorType); } - /** \brief Returns spatial locations (coordinates) of degrees of freedom on a - reference Tetrahedron. 
+ virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; - \param DofCoords [out] - array with the coordinates of degrees of freedom, - dimensioned (F,D) - */ virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEMDef.hpp index ca7c10d67005..09888eddc924 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_COMP12_FEMDef.hpp @@ -408,6 +408,55 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoords); Kokkos::deep_copy(this->dofCoords_, dofCoords); } -} + template + void + Basis_HGRAD_TET_COMP12_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TET_COMP12_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const 
ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TET_COMP12_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_COMP12_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TET_COMP12_FEM::Serial::getValues( output, input); + }); + break; + default: {} + } + } + +}// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEM.hpp index 39b8900291f4..d23d95594bdb 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEM.hpp @@ -62,53 +62,57 @@ namespace Intrepid2 { /** \brief See Intrepid2::Basis_HGRAD_TET_Cn_FEM */ - template + template struct Serial { - template + template KOKKOS_INLINE_FUNCTION static void - getValues( outputValueViewType outputValues, - const inputPointViewType inputPoints, - workViewType work, - const vinvViewType vinv ); + getValues( OutputValueViewType outputValues, + const InputPointViewType inputPoints, + WorkViewType work, + const 
VinvViewType vinv, + const ordinal_type order); }; template + typename OutputValueValueType, class ...OutputValueProperties, + typename InputPointValueType, class ...InputPointProperties, + typename VinvValueType, class ...VinvProperties> static void getValues( const typename DeviceType::execution_space& space, - Kokkos::DynRankView outputValues, - const Kokkos::DynRankView inputPoints, - const Kokkos::DynRankView vinv, + Kokkos::DynRankView outputValues, + const Kokkos::DynRankView inputPoints, + const Kokkos::DynRankView vinv, + const ordinal_type order, const EOperator operatorType); /** \brief See Intrepid2::Basis_HGRAD_TET_Cn_FEM */ - template struct Functor { - outputValueViewType _outputValues; - const inputPointViewType _inputPoints; - const vinvViewType _vinv; - workViewType _work; + OutputValueViewType _outputValues; + const InputPointViewType _inputPoints; + const VinvViewType _vinv; + WorkViewType _work; + const ordinal_type _order; KOKKOS_INLINE_FUNCTION - Functor( outputValueViewType outputValues_, - inputPointViewType inputPoints_, - vinvViewType vinv_, - workViewType work_) + Functor( OutputValueViewType outputValues_, + InputPointViewType inputPoints_, + VinvViewType vinv_, + WorkViewType work_, + ordinal_type order_) : _outputValues(outputValues_), _inputPoints(inputPoints_), - _vinv(vinv_), _work(work_) {} + _vinv(vinv_), _work(work_), _order(order_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type iter) const { @@ -118,15 +122,15 @@ namespace Intrepid2 { const auto ptRange = Kokkos::pair(ptBegin, ptEnd); const auto input = Kokkos::subview( _inputPoints, ptRange, Kokkos::ALL() ); - typename workViewType::pointer_type ptr = _work.data() + _work.extent(0)*ptBegin*get_dimension_scalar(_work); + typename WorkViewType::pointer_type ptr = _work.data() + _work.extent(0)*ptBegin*get_dimension_scalar(_work); auto vcprop = Kokkos::common_view_alloc_prop(_work); - workViewType work(Kokkos::view_wrap(ptr,vcprop), (ptEnd-ptBegin)*_work.extent(0)); + 
WorkViewType work(Kokkos::view_wrap(ptr,vcprop), (ptEnd-ptBegin)*_work.extent(0)); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE : { auto output = Kokkos::subview( _outputValues, Kokkos::ALL(), ptRange ); - Serial::getValues( output, input, work, _vinv ); + Serial::getValues( output, input, work, _vinv, _order ); break; } case OPERATOR_GRAD : @@ -135,7 +139,7 @@ namespace Intrepid2 { //case OPERATOR_D3 : { auto output = Kokkos::subview( _outputValues, Kokkos::ALL(), ptRange, Kokkos::ALL() ); - Serial::getValues( output, input, work, _vinv ); + Serial::getValues( output, input, work, _vinv, _order ); break; } default: { @@ -204,9 +208,29 @@ namespace Intrepid2 { outputValues, inputPoints, this->vinv_, + this->basisDegree_, operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp index c2c5aaf6cbe3..1c8715525bc0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp @@ -24,44 +24,36 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { -template +template template 
+typename InputViewType, +typename WorkViewType, +typename VinvViewType> KOKKOS_INLINE_FUNCTION void -Basis_HGRAD_TET_Cn_FEM::Serial:: +Basis_HGRAD_TET_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv ) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, + const ordinal_type order ) { constexpr ordinal_type spaceDim = 3; const ordinal_type card = vinv.extent(0), npts = input.extent(0); - // compute order - ordinal_type order = 0; - for (ordinal_type p=0;p<=Parameters::MaxOrder;++p) { - if (card == Intrepid2::getPnCardinality(p)) { - order = p; - break; - } - } - - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - viewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); + Serial::getValues(phis, input, workView, order); for (ordinal_type i=0;i(); //(orDn + 1); - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); - viewType dummyView; + const ordinal_type dkcard = getDkCardinality(); //(orDn + 1); + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i outputValues, const Kokkos::DynRankView inputPoints, const Kokkos::DynRankView vinv, + const ordinal_type order, const EOperator 
operatorType) { typedef Kokkos::DynRankView outputValueViewType; typedef Kokkos::DynRankView inputPointViewType; @@ -156,7 +149,7 @@ getValues( workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TET_Cn_FEM::getValues::work", vcprop), cardinality, inputPoints.extent(0)); typedef Functor FunctorType; - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } case OPERATOR_GRAD: @@ -164,23 +157,16 @@ getValues( workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TET_Cn_FEM::getValues::work", vcprop), cardinality*(2*spaceDim+1), inputPoints.extent(0)); typedef Functor FunctorType; - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } case OPERATOR_D2: { typedef Functor FunctorType; workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TET_Cn_FEM::getValues::work", vcprop), cardinality*outputValues.extent(2), inputPoints.extent(0)); - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } - /* case OPERATOR_D3: { - typedef Functor FunctorType; - workViewType work(Kokkos::view_alloc("Basis_HGRAD_TET_Cn_FEM::getValues::work", vcprop), cardinality, inputPoints.extent(0), outputValues.extent(2)); - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); - break; - }*/ default: { INTREPID2_TEST_FOR_EXCEPTION( true , std::invalid_argument, ">>> ERROR (Basis_HGRAD_TET_Cn_FEM): Operator type not implemented" ); @@ -431,5 +417,65 @@ Basis_HGRAD_TET_Cn_FEM( const ordinal_type order, posDfOrd); } } + + template + void + Basis_HGRAD_TET_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const 
PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = getWorkSizePerPoint(operatorType)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TET_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TET_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + constexpr ordinal_type spaceDim = 3; + auto sizePerPoint = (operatorType==OPERATOR_VALUE) ? 
+ this->vinv_.extent(0)*get_dimension_scalar(inputPoints) : + (2*spaceDim+1)*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_TET_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_TET_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HGRAD_TET_Cn_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEM.hpp index 46349310b210..81439bdf3f50 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEM.hpp @@ -162,6 +162,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + 
ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEMDef.hpp index e771ae7a3ee0..bc926788c290 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C1_FEMDef.hpp @@ -204,5 +204,63 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_TRI_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TRI_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; 
//avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C1_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C1_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_C1_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEM.hpp index 9eb45f9c2716..627fa113720e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEM.hpp @@ -173,6 +173,23 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const 
override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEMDef.hpp index 87bb96e2bbe0..86df77f41a27 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_C2_FEMDef.hpp @@ -263,5 +263,63 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_TRI_C2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TRI_C2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_C2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: 
+ Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C2_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C2_FEM::Serial::getValues( output, input); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_TRI_C2_FEM::Serial::getValues( output, input); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_C2_FEM::getValues), Operator Type not supported."); + } + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM.hpp index c8bc97c3fb76..17ada895efe0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM.hpp @@ -60,53 +60,57 @@ namespace Intrepid2 { work is a rank 1 view having the same value_type of inputPoints and having size equal to getWorkSizePerPoint()*inputPoints.extent(0); */ - template + template struct Serial { - template + template KOKKOS_INLINE_FUNCTION static void - getValues( outputValueViewType outputValues, - const inputPointViewType inputPoints, - workViewType work, - const 
vinvViewType vinv ); + getValues( OutputValueViewType outputValues, + const InputPointViewType inputPoints, + WorkViewType work, + const VinvViewType vinv, + const ordinal_type order); }; template + typename OutputValueValueType, class ...OutputValueProperties, + typename InputPointValueType, class ...InputPointProperties, + typename VinvValueType, class ...VinvProperties> static void - getValues(const typename DeviceType::execution_space& space, - Kokkos::DynRankView outputValues, - const Kokkos::DynRankView inputPoints, - const Kokkos::DynRankView vinv, - const EOperator operatorType); + getValues( const typename DeviceType::execution_space& space, + Kokkos::DynRankView outputValues, + const Kokkos::DynRankView inputPoints, + const Kokkos::DynRankView vinv, + const ordinal_type order, + const EOperator operatorType); /** \brief See Intrepid2::Basis_HGRAD_TRI_Cn_FEM */ - template struct Functor { - outputValueViewType _outputValues; - const inputPointViewType _inputPoints; - const vinvViewType _vinv; - workViewType _work; + OutputValueViewType _outputValues; + const InputPointViewType _inputPoints; + const VinvViewType _vinv; + WorkViewType _work; + const ordinal_type _order; KOKKOS_INLINE_FUNCTION - Functor( outputValueViewType outputValues_, - inputPointViewType inputPoints_, - vinvViewType vinv_, - workViewType work_) + Functor( OutputValueViewType outputValues_, + InputPointViewType inputPoints_, + VinvViewType vinv_, + WorkViewType work_, + ordinal_type order_) : _outputValues(outputValues_), _inputPoints(inputPoints_), - _vinv(vinv_), _work(work_) {} + _vinv(vinv_), _work(work_), _order(order_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type iter) const { @@ -116,22 +120,22 @@ namespace Intrepid2 { const auto ptRange = Kokkos::pair(ptBegin, ptEnd); const auto input = Kokkos::subview( _inputPoints, ptRange, Kokkos::ALL() ); - typename workViewType::pointer_type ptr = _work.data() + _work.extent(0)*ptBegin*get_dimension_scalar(_work); + typename 
WorkViewType::pointer_type ptr = _work.data() + _work.extent(0)*ptBegin*get_dimension_scalar(_work); auto vcprop = Kokkos::common_view_alloc_prop(_work); - workViewType work(Kokkos::view_wrap(ptr,vcprop), (ptEnd-ptBegin)*_work.extent(0)); + WorkViewType work(Kokkos::view_wrap(ptr,vcprop), (ptEnd-ptBegin)*_work.extent(0)); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE : { auto output = Kokkos::subview( _outputValues, Kokkos::ALL(), ptRange ); - Serial::getValues( output, input, work, _vinv ); + Serial::getValues( output, input, work, _vinv, _order ); break; } case OPERATOR_CURL: case OPERATOR_D1: case OPERATOR_D2: { auto output = Kokkos::subview( _outputValues, Kokkos::ALL(), ptRange, Kokkos::ALL() ); - Serial::getValues( output, input, work, _vinv ); + Serial::getValues( output, input, work, _vinv, _order ); break; } default: { @@ -200,9 +204,29 @@ namespace Intrepid2 { outputValues, inputPoints, this->vinv_, + this->basisDegree_, operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEMDef.hpp index 681148713a06..c7b7a40cfa7b 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEMDef.hpp +++ 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEMDef.hpp @@ -23,44 +23,36 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { -template +template template +typename InputViewType, +typename WorkViewType, +typename VinvViewType> KOKKOS_INLINE_FUNCTION void -Basis_HGRAD_TRI_Cn_FEM::Serial:: +Basis_HGRAD_TRI_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv ) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, + const ordinal_type order ) { constexpr ordinal_type spaceDim = 2; const ordinal_type card = vinv.extent(0), npts = input.extent(0); - // compute order - ordinal_type order = 0; - for (ordinal_type p=0;p<=Parameters::MaxOrder;++p) { - if (card == Intrepid2::getPnCardinality(p) ) { - order = p; - break; - } - } - - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - viewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); + Serial::getValues(phis, input, workView, order); for (ordinal_type i=0;i(); //(orDn + 1); - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); - viewType dummyView; + const ordinal_type dkcard = getDkCardinality(); //(orDn + 1); + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); + ViewType dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - 
Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i outputValues, const Kokkos::DynRankView inputPoints, const Kokkos::DynRankView vinv, + const ordinal_type order, const EOperator operatorType) { typedef Kokkos::DynRankView outputValueViewType; typedef Kokkos::DynRankView inputPointViewType; @@ -175,7 +168,7 @@ getValues( workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TRI_Cn_FEM::getValues::work", vcprop), cardinality, inputPoints.extent(0)); typedef Functor FunctorType; - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } case OPERATOR_GRAD: @@ -183,30 +176,23 @@ getValues( workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TRI_Cn_FEM::getValues::work", vcprop), cardinality*(2*spaceDim+1), inputPoints.extent(0)); typedef Functor FunctorType; - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } case OPERATOR_CURL: { workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TRI_Cn_FEM::getValues::work", vcprop), cardinality*(2*spaceDim+1), inputPoints.extent(0)); typedef Functor FunctorType; - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } case OPERATOR_D2: { typedef Functor FunctorType; workViewType work(Kokkos::view_alloc(space, "Basis_HGRAD_TRI_Cn_FEM::getValues::work", vcprop), cardinality*outputValues.extent(2), inputPoints.extent(0)); - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); + Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work, order) ); break; } - /* case OPERATOR_D3: { - typedef Functor 
FunctorType; - workViewType work(Kokkos::view_alloc("Basis_HGRAD_TRI_Cn_FEM::getValues::work", vcprop), cardinality, inputPoints.extent(0), outputValues.extent(2)); - Kokkos::parallel_for( policy, FunctorType(outputValues, inputPoints, vinv, work) ); - break; - }*/ default: { INTREPID2_TEST_FOR_EXCEPTION( true , std::invalid_argument, ">>> ERROR (Basis_HGRAD_TRI_Cn_FEM): Operator type not implemented" ); @@ -242,7 +228,7 @@ Basis_HGRAD_TRI_Cn_FEM( const ordinal_type order, PointTools::getLattice( dofCoords, cellTopo, order, offset, - pointType_ ); + this->pointType_ ); this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoords); Kokkos::deep_copy(this->dofCoords_, dofCoords); @@ -384,5 +370,74 @@ Basis_HGRAD_TRI_Cn_FEM( const ordinal_type order, posDfOrd); } } + + template + void + Basis_HGRAD_TRI_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = getWorkSizePerPoint(operatorType)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_TRI_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename 
DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + constexpr ordinal_type spaceDim = 2; + auto sizePerPoint = (operatorType==OPERATOR_VALUE) ? + this->vinv_.extent(0)*get_dimension_scalar(inputPoints) : + (2*spaceDim+1)*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_TRI_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_TRI_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + case OPERATOR_CURL: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type(pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt,pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HGRAD_TRI_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + 
break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HGRAD_TRI_Cn_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 + #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM_ORTHDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM_ORTHDef.hpp index 6f5e5abb1dd0..a2b4271518ba 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM_ORTHDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TRI_Cn_FEM_ORTHDef.hpp @@ -198,69 +198,8 @@ void OrthPolynomialTri::ge const inputViewType /* input */, workViewType /* work */, const ordinal_type /* order */ ) { -#if 0 //#ifdef HAVE_INTREPID2_SACADO - -constexpr ordinal_type spaceDim = 2; -constexpr ordinal_type maxCard = Intrepid2::getPnCardinality(); - -typedef typename OutputViewType::value_type value_type; -typedef Sacado::Fad::SFad fad_type; - -const ordinal_type -npts = input.extent(0), -card = output.extent(0); - -// use stack buffer -fad_type inBuf[Parameters::MaxNumPtsPerBasisEval][spaceDim], -outBuf[maxCard][Parameters::MaxNumPtsPerBasisEval][n]; - -typedef typename inputViewType::memory_space memory_space; -typedef typename Kokkos::View outViewType; -typedef typename Kokkos::View inViewType; -auto vcprop = Kokkos::common_view_alloc_prop(input); - -inViewType in(Kokkos::view_wrap((value_type*)&inBuf[0][0], vcprop), npts, spaceDim); -outViewType out(Kokkos::view_wrap((value_type*)&outBuf[0][0][0], vcprop), card, npts, n); - -for (ordinal_type i=0;i outViewType_; -outViewType_ workView; -if (n==2) { - //char outBuf[bufSize*sizeof(typename inViewType::value_type)]; - fad_type outBuf[maxCard][Parameters::MaxNumPtsPerBasisEval][spaceDim+1]; - auto vcprop = Kokkos::common_view_alloc_prop(in); - workView = outViewType_( Kokkos::view_wrap((value_type*)&outBuf[0][0][0], vcprop), card, npts, spaceDim+1); -} -OrthPolynomialTri::generate(out, in, workView, 
order); - -for (ordinal_type i=0;i 0) { - //n=2: (f_x)_x, (f_y)_x - //n=3: (f_xx)_x, (f_xy)_x, (f_yy)_x - ordinal_type i_Dnm1 = i_dy; - output.access(i,j,i_Dn) = out(i,j,i_Dnm1).dx(0); - } - else { - //n=2: (f_y)_y, (f_z)_y - //n=3: (f_yy)_y - ordinal_type i_Dnm1 = i_dy-1; - output.access(i,j,i_Dn) = out(i,j,i_Dnm1).dx(1); - } - } - } -#else INTREPID2_TEST_FOR_ABORT( true, ">>> ERROR: (Intrepid2::Basis_HGRAD_TRI_Cn_FEM_ORTH::OrthPolynomialTri) Computing of second and higher-order derivatives is not currently supported"); -#endif } diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEM.hpp index 15daedfbfe49..5b8b73634bb4 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEM.hpp @@ -166,6 +166,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEMDef.hpp index 9d2c461edca2..8d76318a49e0 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEMDef.hpp +++ 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C1_FEMDef.hpp @@ -245,5 +245,54 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_WEDGE_C1_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_WEDGE_C1_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_WEDGE_C1_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_WEDGE_C1_FEM::template Serial::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + Impl::Basis_HGRAD_WEDGE_C1_FEM::template Serial::getValues( output, input); + }); + break; + default: {} + } + } + }// namespace Intrepid2 #endif 
diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEM.hpp index d4cb38e7ca55..c952afcf0e6e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEM.hpp @@ -215,6 +215,23 @@ namespace Intrepid2 { operatorType );; } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEMDef.hpp index 363d21ad19ea..9f5327d94187 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_WEDGE_C2_FEMDef.hpp @@ -30,12 +30,13 @@ namespace Intrepid2 { Basis_HGRAD_WEDGE_DEG2_FEM::Serial:: getValues( OutputViewType output, const inputViewType input ) { + typedef typename inputViewType::value_type value_type; switch (opType) { case OPERATOR_VALUE: { - const auto x = input(0); - const auto y = input(1); - const auto z = input(2); - const auto w = 1.0 - x - y; + const value_type x = input(0); + const value_type y = input(1); + const value_type z = input(2); + const value_type w = 1.0 - x - y; // 
output is a rank-1 array with dimensions (basisCardinality_) if constexpr (!serendipity) { @@ -80,9 +81,9 @@ namespace Intrepid2 { break; } case OPERATOR_GRAD: { - const auto x = input(0); - const auto y = input(1); - const auto z = input(2); + const value_type x = input(0); + const value_type y = input(1); + const value_type z = input(2); if constexpr (!serendipity) { output.access(0, 0) = ((-3 + 4*x + 4*y)*(-1 + z)*z)/2.; @@ -158,7 +159,7 @@ namespace Intrepid2 { output.access(17, 1) = 4*(-1 + x + 2*y)*(-1 + z*z); output.access(17, 2) = 8*y*(-1 + x + y)*z; } else { - const auto w = 1.0 - x - y; + const value_type w = 1.0 - x - y; output.access(0, 0) = -(2.0*w - 1.0 - 0.5*z)*(1.0 - z); output.access(0, 1) = -(2.0*w - 1.0 - 0.5*z)*(1.0 - z); @@ -223,9 +224,9 @@ namespace Intrepid2 { break; } case OPERATOR_D2: { - const auto x = input(0); - const auto y = input(1); - const auto z = input(2); + const value_type x = input(0); + const value_type y = input(1); + const value_type z = input(2); if constexpr (!serendipity) { output.access(0, 0) = 2.*(-1. 
+ z)*z; @@ -356,7 +357,7 @@ namespace Intrepid2 { } else { //serendipity element - const auto w = 1.0 - x - y; + const value_type w = 1.0 - x - y; output.access(0, 0) = 2.0*(1.0 - z); output.access(0, 1) = 2.0*(1.0 - z); output.access(0, 2) = 2.0*w - 0.5 - z; @@ -466,9 +467,9 @@ namespace Intrepid2 { } case OPERATOR_D3: { if constexpr (!serendipity) { - const auto x = input(0); - const auto y = input(1); - const auto z = input(2); + const value_type x = input(0); + const value_type y = input(1); + const value_type z = input(2); output.access(0, 0) = 0.; output.access(0, 1) = 0.; @@ -1082,5 +1083,56 @@ namespace Intrepid2 { Kokkos::deep_copy(this->dofCoords_, dofCoords); } + template + void + Basis_HGRAD_WEDGE_DEG2_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 0; + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HGRAD_WEDGE_DEG2_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim <= 0) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HGRAD_WEDGE_DEG2_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + (void) scratchStorage; //avoid unused variable warning + + const int numPoints = inputPoints.extent(0); + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + 
using SerialValue = typename Impl::Basis_HGRAD_WEDGE_DEG2_FEM::template Serial; + SerialValue::getValues( output, input); + }); + break; + case OPERATOR_GRAD: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), pt, Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, pt, Kokkos::ALL() ); + using SerialGrad = typename Impl::Basis_HGRAD_WEDGE_DEG2_FEM::template Serial; + SerialGrad::getValues( output, input); + }); + break; + default: {} + } + } + }// namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEM.hpp index 96cf0a64405b..388eb9ccdd1b 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEM.hpp @@ -137,20 +137,22 @@ namespace Intrepid2 { class Basis_HVOL_HEX_Cn_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. 
*/ Basis_HVOL_HEX_Cn_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -172,6 +174,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEMDef.hpp index 652df8ee2689..617eeb9cad84 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_HEX_Cn_FEMDef.hpp @@ -9,7 +9,7 @@ /** \file Intrepid2_HVOL_HEX_Cn_FEMDef.hpp \brief Definition file for FEM basis functions of degree n for H(vol) functions on HEX cells - \author Created by M. Perego, based on the Intrepid2::HGRAD_HEX_Cn_FEM class + \author Created by M. 
Perego, based on the Intrepid2::HVOL_HEX_Cn_FEM class */ #ifndef __INTREPID2_HVOL_HEX_CN_FEMDEF_HPP__ @@ -20,18 +20,18 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HVOL_HEX_Cn_FEM::Serial:: + Basis_HVOL_HEX_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, const ordinal_type operatorDn ) { ordinal_type opDn = operatorDn; @@ -43,21 +43,21 @@ namespace Intrepid2 { const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); const auto input_z = Kokkos::subview(input, Kokkos::ALL(), range_type(2,3)); - const ordinal_type dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; auto ptr3 = work.data()+3*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - viewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); - viewType output_z(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts); + ViewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + ViewType output_z(Kokkos::view_wrap(ptr3, vcprop), 
cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); @@ -88,7 +88,7 @@ namespace Intrepid2 { case OPERATOR_D8: case OPERATOR_D9: case OPERATOR_D10: - opDn = getOperatorOrder(opType); + opDn = getOperatorOrder(OpType); case OPERATOR_Dn: { const ordinal_type dkcard = opDn + 1; @@ -105,35 +105,35 @@ namespace Intrepid2 { if (mult_x < 0) { // pass } else { - viewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); decltype(work_line) output_x, output_y, output_z; if (mult_x) { - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts, 1); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv, mult_x); } else { - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); } if (mult_y) { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv, mult_y); } else { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv); } if (mult_z) { - output_z = viewType(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts, 1); + output_z = ViewType(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts, 1); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_z, input_z, work_line, vinv, mult_z); } else { - output_z = viewType(Kokkos::view_wrap(ptr3, vcprop), cardLine, npts); + output_z = ViewType(Kokkos::view_wrap(ptr3, vcprop), cardLine, 
npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_z, input_z, work_line, vinv); } @@ -316,7 +316,55 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoordsHost); Kokkos::deep_copy(this->dofCoords_, dofCoordsHost); } - -}// namespace Intrepid2 + + template + void + Basis_HVOL_HEX_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 4*this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HVOL_HEX_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HVOL_HEX_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + auto sizePerPoint = 4*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = 
Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HVOL_HEX_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HVOL_HEX_Cn_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEM.hpp index 380438f33bb4..0be4ce27fba8 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEM.hpp @@ -145,15 +145,16 @@ namespace Intrepid2 { : public Basis { public: using BasisBase = Basis; + using HostBasis = Basis_HVOL_LINE_Cn_FEM; - - using OrdinalTypeArray1DHost = typename BasisBase::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename BasisBase::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename BasisBase::OrdinalTypeArray3DHost; - - using OutputViewType = typename BasisBase::OutputViewType; - using PointViewType = typename BasisBase::PointViewType ; - using ScalarViewType = typename BasisBase::ScalarViewType; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. 
*/ @@ -182,6 +183,23 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPointsconst, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEMDef.hpp index 3d742d4a30a4..dc8f25d3cd7e 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_LINE_Cn_FEMDef.hpp @@ -9,7 +9,7 @@ /** \file Intrepid2_HVOL_LINE_Cn_FEMDef.hpp \brief Definition file for FEM basis functions of degree n for H(vol) functions on LINE. - \author Created by M. Perego, based on the Intrepid2::HGRAD_LINE_Cn_FEM class + \author Created by M. 
Perego, based on the Intrepid2::HVOL_LINE_Cn_FEM class */ #ifndef __INTREPID2_HVOL_LINE_CN_FEM_DEF_HPP__ @@ -22,16 +22,16 @@ namespace Intrepid2 { template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void Basis_HVOL_LINE_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, const ordinal_type operatorDn ) { ordinal_type opDn = operatorDn; @@ -41,12 +41,12 @@ namespace Intrepid2 { const ordinal_type order = card - 1; const double alpha = 0.0, beta = 0.0; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); switch (opType) { case OPERATOR_VALUE: { - viewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts); + ViewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts); Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI:: Serial::getValues(phis, input, order, alpha, beta); @@ -74,7 +74,7 @@ namespace Intrepid2 { case OPERATOR_Dn: { // dkcard is always 1 for 1D element const ordinal_type dkcard = 1; - viewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts, dkcard); + ViewType phis(Kokkos::view_wrap(work.data(), vcprop), card, npts, dkcard); Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI:: Serial::getValues(phis, input, order, alpha, beta, opDn); @@ -289,22 +289,56 @@ namespace Intrepid2 { posDfOrd); } } + + template + void + Basis_HVOL_LINE_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + 
Basis_HVOL_LINE_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HVOL_LINE_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + ordinal_type sizePerPoint = this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HVOL_LINE_Cn_FEM::Serial::getValues( output, input, work, this->vinv_ ); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HVOL_LINE_Cn_FEM): getValues not implemented for this operator"); + } + } + } }// namespace Intrepid2 #endif - - - - - - - - - - - - - - - diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEM.hpp index 496522a6278c..6329c3cb30ce 100644 --- 
a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEM.hpp @@ -132,20 +132,22 @@ namespace Intrepid2 { class Basis_HVOL_QUAD_Cn_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. */ Basis_HVOL_QUAD_Cn_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - using Basis::getValues; + using BasisBase::getValues; virtual void @@ -167,6 +169,24 @@ namespace Intrepid2 { operatorType ); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEMDef.hpp 
b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEMDef.hpp index 2a9e2678b771..f492b6a65f7c 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_QUAD_Cn_FEMDef.hpp @@ -19,22 +19,22 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { - - template + + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HVOL_QUAD_Cn_FEM::Serial:: + Basis_HVOL_QUAD_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv, const ordinal_type operatorDn ) { ordinal_type opDn = operatorDn; - + const ordinal_type cardLine = vinv.extent(0); const ordinal_type npts = input.extent(0); @@ -42,19 +42,19 @@ namespace Intrepid2 { const auto input_x = Kokkos::subview(input, Kokkos::ALL(), range_type(0,1)); const auto input_y = Kokkos::subview(input, Kokkos::ALL(), range_type(1,2)); - const int dim_s = get_dimension_scalar(work); + const ordinal_type dim_s = get_dimension_scalar(input); auto ptr0 = work.data(); auto ptr1 = work.data()+cardLine*npts*dim_s; auto ptr2 = work.data()+2*cardLine*npts*dim_s; - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); - - switch (opType) { + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); + + switch (OpType) { case OPERATOR_VALUE: { - viewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); - viewType output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); - viewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + ViewType work_line(Kokkos::view_wrap(ptr0, vcprop), cardLine, npts); + ViewType 
output_x(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + ViewType output_y(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); @@ -81,33 +81,33 @@ namespace Intrepid2 { case OPERATOR_D8: case OPERATOR_D9: case OPERATOR_D10: - opDn = getOperatorOrder(opType); + opDn = getOperatorOrder(OpType); case OPERATOR_Dn: { const auto dkcard = opDn + 1; for (auto l=0;l:: getValues(output_x, input_x, work_line, vinv, mult_x); } else { - output_x = viewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); + output_x = ViewType(Kokkos::view_wrap(ptr1, vcprop), cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_x, input_x, work_line, vinv); } if (mult_y) { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts, 1); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv, mult_y); } else { - output_y = viewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); + output_y = ViewType(Kokkos::view_wrap(ptr2, vcprop), cardLine, npts); Impl::Basis_HVOL_LINE_Cn_FEM::Serial:: getValues(output_y, input_y, work_line, vinv); } @@ -282,7 +282,55 @@ namespace Intrepid2 { this->dofCoords_ = Kokkos::create_mirror_view(typename DT::memory_space(), dofCoordsHost); Kokkos::deep_copy(this->dofCoords_, dofCoordsHost); } - -} + + template + void + Basis_HVOL_QUAD_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = 3*this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HVOL_QUAD_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename 
Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HVOL_QUAD_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + auto sizePerPoint = 3*this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HVOL_QUAD_Cn_FEM::Serial::getValues( output, input, work, this->vinv_, this->basisDegree_); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HVOL_QUAD_Cn_FEM): getValues not implemented for this operator"); + } + } + } + +} // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEM.hpp index d47afbf7724f..8f9010f619b8 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEM.hpp @@ -156,23 +156,23 @@ namespace Intrepid2 { 
class Basis_HVOL_TET_Cn_FEM : public Basis { public: - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + using BasisBase = Basis; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. */ Basis_HVOL_TET_Cn_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - - - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - typedef typename Basis::scalarType scalarType; - - using Basis::getValues; + + using scalarType = typename BasisBase::scalarType; + using BasisBase::getValues; virtual void @@ -194,6 +194,24 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEMDef.hpp index 
a0945a008159..7927a1e124f6 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TET_Cn_FEMDef.hpp @@ -23,18 +23,18 @@ namespace Intrepid2 { namespace Impl { - template + template template + typename InputViewType, + typename WorkViewType, + typename VinvViewType> KOKKOS_INLINE_FUNCTION void - Basis_HVOL_TET_Cn_FEM::Serial:: + Basis_HVOL_TET_Cn_FEM::Serial:: getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv ) { + const InputViewType input, + WorkViewType work, + const VinvViewType vinv ) { constexpr ordinal_type spaceDim = 3; const ordinal_type @@ -50,17 +50,17 @@ namespace Intrepid2 { } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); + Serial::getValues(phis, input, workView, order); for (ordinal_type i=0;i(); //(orDn + 1); - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); - workViewType dummyView; + const ordinal_type dkcard = getDkCardinality(); //(orDn + 1); + const + ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); + ViewType dummyView; Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i + void + 
Basis_HVOL_TET_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HVOL_TET_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HVOL_TET_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + auto sizePerPoint = this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HVOL_TET_Cn_FEM::Serial::getValues( output, input, work, this->vinv_); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR 
(Basis_HVOL_TET_Cn_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEM.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEM.hpp index 43a2161c9050..ff20e7426957 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEM.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEM.hpp @@ -151,25 +151,24 @@ namespace Intrepid2 { class Basis_HVOL_TRI_Cn_FEM : public Basis { public: + using BasisBase = Basis; using HostBasis = Basis_HVOL_TRI_Cn_FEM; - - using OrdinalTypeArray1DHost = typename Basis::OrdinalTypeArray1DHost; - using OrdinalTypeArray2DHost = typename Basis::OrdinalTypeArray2DHost; - using OrdinalTypeArray3DHost = typename Basis::OrdinalTypeArray3DHost; + + using typename BasisBase::OrdinalTypeArray1DHost; + using typename BasisBase::OrdinalTypeArray2DHost; + using typename BasisBase::OrdinalTypeArray3DHost; + + using typename BasisBase::OutputViewType; + using typename BasisBase::PointViewType ; + using typename BasisBase::ScalarViewType; /** \brief Constructor. 
*/ Basis_HVOL_TRI_Cn_FEM(const ordinal_type order, const EPointType pointType = POINTTYPE_EQUISPACED); - - - using OutputViewType = typename Basis::OutputViewType; - using PointViewType = typename Basis::PointViewType; - using ScalarViewType = typename Basis::ScalarViewType; - - typedef typename Basis::scalarType scalarType; - - using Basis::getValues; + + using scalarType = typename BasisBase::scalarType; + using BasisBase::getValues; virtual void @@ -191,6 +190,24 @@ namespace Intrepid2 { operatorType); } + virtual void + getScratchSpaceSize( ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType = OPERATOR_VALUE) const override; + + KOKKOS_INLINE_FUNCTION + virtual void + getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename Kokkos::TeamPolicy::member_type& team_member, + const typename DeviceType::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim = -1, + const ordinal_type subcellOrdinal = -1) const override; + + virtual void getDofCoords( ScalarViewType dofCoords ) const override { diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEMDef.hpp index f870940f506b..aa6f54065ff6 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HVOL_TRI_Cn_FEMDef.hpp @@ -22,18 +22,18 @@ namespace Intrepid2 { // ------------------------------------------------------------------------------------- namespace Impl { -template -template -KOKKOS_INLINE_FUNCTION -void -Basis_HVOL_TRI_Cn_FEM::Serial:: -getValues( OutputViewType output, - const inputViewType input, - workViewType work, - const vinvViewType vinv ) { + template + template + KOKKOS_INLINE_FUNCTION + void + Basis_HVOL_TRI_Cn_FEM::Serial:: 
+ getValues( OutputViewType output, + const InputViewType input, + WorkViewType work, + const VinvViewType vinv ) { constexpr ordinal_type spaceDim = 2; const ordinal_type @@ -49,17 +49,17 @@ getValues( OutputViewType output, } } - typedef typename Kokkos::DynRankView viewType; - auto vcprop = Kokkos::common_view_alloc_prop(work); + typedef typename Kokkos::DynRankView ViewType; + auto vcprop = Kokkos::common_view_alloc_prop(input); auto ptr = work.data(); - switch (opType) { + switch (OpType) { case OPERATOR_VALUE: { - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); - workViewType dummyView; + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts); + ViewType dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i::getValues(phis, input, workView, order); + Serial::getValues(phis, input, workView, order); for (ordinal_type i=0;i(); //(orDn + 1); - const viewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); - workViewType dummyView; + const ordinal_type dkcard = getDkCardinality(); //(orDn + 1); + const ViewType phis(Kokkos::view_wrap(ptr, vcprop), card, npts, dkcard); + ViewType dummyView; Impl::Basis_HGRAD_TRI_Cn_FEM_ORTH:: - Serial::getValues(phis, input, dummyView, order); + Serial::getValues(phis, input, dummyView, order); for (ordinal_type i=0;i + void + Basis_HVOL_TRI_Cn_FEM::getScratchSpaceSize( + ordinal_type& perTeamSpaceSize, + ordinal_type& perThreadSpaceSize, + const PointViewType inputPoints, + const EOperator operatorType) const { + perTeamSpaceSize = 0; + perThreadSpaceSize = this->vinv_.extent(0)*get_dimension_scalar(inputPoints)*sizeof(typename BasisBase::scalarType); + } + + template + KOKKOS_INLINE_FUNCTION + void + Basis_HVOL_TRI_Cn_FEM::getValues( + OutputViewType outputValues, + const PointViewType inputPoints, + const EOperator operatorType, + const typename 
Kokkos::TeamPolicy::member_type& team_member, + const typename DT::execution_space::scratch_memory_space & scratchStorage, + const ordinal_type subcellDim, + const ordinal_type subcellOrdinal) const { + + INTREPID2_TEST_FOR_ABORT( !((subcellDim == -1) && (subcellOrdinal == -1)), + ">>> ERROR: (Intrepid2::Basis_HVOL_TRI_Cn_FEM::getValues), The capability of selecting subsets of basis functions has not been implemented yet."); + + const int numPoints = inputPoints.extent(0); + using ScalarType = typename ScalarTraits::scalar_type; + using WorkViewType = Kokkos::DynRankView< ScalarType,typename DT::execution_space::scratch_memory_space,Kokkos::MemoryTraits >; + auto sizePerPoint = this->vinv_.extent(0)*get_dimension_scalar(inputPoints); + WorkViewType workView(scratchStorage, sizePerPoint*team_member.team_size()); + using range_type = Kokkos::pair; + switch(operatorType) { + case OPERATOR_VALUE: + Kokkos::parallel_for (Kokkos::TeamThreadRange (team_member, numPoints), [=] (ordinal_type& pt) { + auto output = Kokkos::subview( outputValues, Kokkos::ALL(), range_type (pt,pt+1), Kokkos::ALL() ); + const auto input = Kokkos::subview( inputPoints, range_type(pt, pt+1), Kokkos::ALL() ); + WorkViewType work(workView.data() + sizePerPoint*team_member.team_rank(), sizePerPoint); + Impl::Basis_HVOL_TRI_Cn_FEM::Serial::getValues( output, input, work, this->vinv_); + }); + break; + default: { + INTREPID2_TEST_FOR_ABORT( true, + ">>> ERROR (Basis_HVOL_TRI_Cn_FEM): getValues not implemented for this operator"); + } + } + } + } // namespace Intrepid2 #endif diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_CubatureControlVolumeSideDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_CubatureControlVolumeSideDef.hpp index ad2308807bf7..cb90d54c2c08 100644 --- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_CubatureControlVolumeSideDef.hpp +++ 
b/packages/intrepid2/src/Discretization/Integration/Intrepid2_CubatureControlVolumeSideDef.hpp @@ -113,7 +113,7 @@ namespace Intrepid2 { const auto numSideNodeMaps = (spaceDim == 2 ? 1 : 2); const ordinal_type sideOrd[2] = { 1, 5 }; - Kokkos::pair nodeRangePerSide[2]; + Kokkos::pair nodeRangePerSide[2]={}; // the second rage is cell specific to handle remained sides switch (primaryCellTopo_.getKey()) { diff --git a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp index 26d71b652d17..7a69b74f9fa7 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp @@ -226,21 +226,22 @@ namespace Intrepid2 { } else { const double one = 1.0, two = 2.0; - typename zViewType::value_type pd_buf[MaxPolylibPoint]; - Kokkos::View - pd((typename zViewType::pointer_type)&pd_buf[0], MaxPolylibPoint); - + auto pd = Kokkos::subview(D, np-1, Kokkos::pair(0,np)); JacobiPolynomialDerivative(np, z, pd, np, alpha, beta); - for (ordinal_type i = 0; i < np; ++i) - for (ordinal_type j = 0; j < np; ++j) - if (i != j) - //D(i*np+j) = pd(j)/(pd(i)*(z(j)-z(i))); <--- This is either a bug, or the derivative matrix is not defined consistently. 
- D(i,j) = pd(i)/(pd(j)*(z(i)-z(j))); - else - D(i,j) = (alpha - beta + (alpha + beta + two)*z(j))/ - (two*(one - z(j)*z(j))); + // The temporary view pd is stored in the last row of the matrix D + // This loop is designed so that we do not overwrite pd entries before we read them + for (ordinal_type i = 0; i < np; ++i) { + const auto & pd_i = pd(i); + const auto & z_i = z(i); + for (ordinal_type j = 0; j < i; ++j) { + const auto & pd_j = pd(j); + const auto & z_j = z(j); + D(j,i) = pd_j/(pd_i*(z_j-z_i)); + D(i,j) = pd_i/(pd_j*(z_i-z_j)); + } + D(i,i) = (alpha - beta + (alpha + beta + two)*z_i) / (two*(one - z_i*z_i)); + } } } @@ -260,13 +261,8 @@ namespace Intrepid2 { } else { const double one = 1.0, two = 2.0; - typename zViewType::value_type pd_buf[MaxPolylibPoint]; - Kokkos::View - pd((typename zViewType::pointer_type)&pd_buf[0], MaxPolylibPoint); - - pd(0) = pow(-one,np-1)*GammaFunction(np+beta+one); - pd(0) /= GammaFunction(np)*GammaFunction(beta+two); + auto pd = Kokkos::subview(D, np-1, Kokkos::pair(0,np)); + pd(0) = pow(-one,np-1)*GammaFunction(np+beta+one) / (GammaFunction(np)*GammaFunction(beta+two)); auto pd_plus_1 = Kokkos::subview(pd, Kokkos::pair(1, pd.extent(0))); auto z_plus_1 = Kokkos::subview( z, Kokkos::pair(1, z.extent(0))); @@ -275,17 +271,22 @@ namespace Intrepid2 { for(ordinal_type i = 1; i < np; ++i) pd(i) *= (1+z(i)); - for (ordinal_type i = 0; i < np; ++i) - for (ordinal_type j = 0; j < np; ++j) - if (i != j) - D(i,j) = pd(i)/(pd(j)*(z(i)-z(j))); - else - if (j == 0) - D(i,j) = -(np + alpha + beta + one)*(np - one)/ - (two*(beta + two)); - else - D(i,j) = (alpha - beta + one + (alpha + beta + one)*z(j))/ - (two*(one - z(j)*z(j))); + // The temporary view pd is stored in the last row of the matrix D + // This loop is designed so that we do not overwrite pd entries before we read them + for (ordinal_type i = 0; i < np; ++i) { + const auto & pd_i = pd(i); + const auto & z_i = z(i); + for (ordinal_type j = 0; j < i; ++j) { + const auto & pd_j 
= pd(j); + const auto & z_j = z(j); + D(j,i) = pd_j/(pd_i*(z_j-z_i)); + D(i,j) = pd_i/(pd_j*(z_i-z_j)); + } + if (i == 0) + D(i,i) = -(np + alpha + beta + one)*(np - one) / (two*(beta + two)); + else + D(i,i) = (alpha - beta + one + (alpha + beta + one)*z_i) / (two*(one - z_i*z_i)); + } } } @@ -305,29 +306,30 @@ namespace Intrepid2 { } else { const double one = 1.0, two = 2.0; - typename zViewType::value_type pd_buf[MaxPolylibPoint]; - Kokkos::View - pd((typename zViewType::pointer_type)&pd_buf[0], MaxPolylibPoint); + auto pd = Kokkos::subview(D, np-1, Kokkos::pair(0,np)); JacobiPolynomialDerivative(np-1, z, pd, np-1, alpha+1, beta); for (ordinal_type i = 0; i < np-1; ++i) pd(i) *= (1-z(i)); - pd(np-1) = -GammaFunction(np+alpha+one); - pd(np-1) /= GammaFunction(np)*GammaFunction(alpha+two); - - for (ordinal_type i = 0; i < np; ++i) - for (ordinal_type j = 0; j < np; ++j) - if (i != j) - D(i,j) = pd(i)/(pd(j)*(z(i)-z(j))); - else - if (j == np-1) - D(i,j) = (np + alpha + beta + one)*(np - one)/ - (two*(alpha + two)); - else - D(i,j) = (alpha - beta - one + (alpha + beta + one)*z(j))/ - (two*(one - z(j)*z(j))); + pd(np-1) = -GammaFunction(np+alpha+one) / (GammaFunction(np)*GammaFunction(alpha+two)); + + // The temporary view pd is stored in the last row of the matrix D + // This loop is designed so that we do not overwrite pd entries before we read them + for (ordinal_type i = 0; i < np; ++i) { + const auto & pd_i = pd(i); + const auto & z_i = z(i); + for (ordinal_type j = 0; j < i; ++j) { + const auto & pd_j = pd(j); + const auto & z_j = z(j); + D(j,i) = pd_j/(pd_i*(z_j-z_i)); + D(i,j) = pd_i/(pd_j*(z_i-z_j)); + } + if (i == np-1) + D(i,i) = (np + alpha + beta + one)*(np - one) / (two*(alpha + two)); + else + D(i,i) = (alpha - beta - one + (alpha + beta + one)*z_i) / (two*(one - z_i*z_i)); + } } } @@ -347,10 +349,7 @@ namespace Intrepid2 { } else { const double one = 1.0, two = 2.0; - typename zViewType::value_type pd_buf[MaxPolylibPoint]; - Kokkos::View - 
pd((typename zViewType::pointer_type)&pd_buf[0], MaxPolylibPoint); + auto pd = Kokkos::subview(D, np-1, Kokkos::pair(0,np)); pd(0) = two*pow(-one,np)*GammaFunction(np + beta); pd(0) /= GammaFunction(np - one)*GammaFunction(beta + two); @@ -359,24 +358,32 @@ namespace Intrepid2 { auto z_plus_1 = Kokkos::subview( z, Kokkos::pair(1, z.extent(0))); JacobiPolynomialDerivative(np-2, z_plus_1, pd_plus_1, np-2, alpha+1, beta+1); - for (ordinal_type i = 1; i < np-1; ++i) - pd(i) *= (one-z(i)*z(i)); + for (ordinal_type i = 1; i < np-1; ++i) { + const auto & z_i = z(i); + pd(i) *= (one-z_i*z_i); + } pd(np-1) = -two*GammaFunction(np + alpha); pd(np-1) /= GammaFunction(np - one)*GammaFunction(alpha + two); - for (ordinal_type i = 0; i < np; ++i) - for (ordinal_type j = 0; j < np; ++j) - if (i != j) - D(i,j) = pd(i)/(pd(j)*(z(i)-z(j))); - else - if (j == 0) - D(i,j) = (alpha - (np-1)*(np + alpha + beta))/(two*(beta+ two)); - else if (j == np-1) - D(i,j) =-(beta - (np-1)*(np + alpha + beta))/(two*(alpha+ two)); - else - D(i,j) = (alpha - beta + (alpha + beta)*z(j))/ - (two*(one - z(j)*z(j))); + // The temporary view pd is stored in the last row of the matrix D + // This loop is designed so that we do not overwrite pd entries before we read them + for (ordinal_type i = 0; i < np; ++i) { + const auto & pd_i = pd(i); + const auto & z_i = z(i); + for (ordinal_type j = 0; j < i; ++j) { + const auto & pd_j = pd(j); + const auto & z_j = z(j); + D(j,i) = pd_j/(pd_i*(z_j-z_i)); + D(i,j) = pd_i/(pd_j*(z_i-z_j)); + } + if (i == 0) + D(i,i) = (alpha - (np-1)*(np + alpha + beta))/(two*(beta+ two)); + else if (i == np-1) + D(i,i) =-(beta - (np-1)*(np + alpha + beta))/(two*(alpha+ two)); + else + D(i,i) = (alpha - beta + (alpha + beta)*z_i)/(two*(one - z_i*z_i)); + } } } @@ -591,57 +598,51 @@ namespace Intrepid2 { for (ordinal_type i = 0; i < np; ++i) polyd(i) = 0.5*(alpha + beta + two); } else { - double a1, a2, a3, a4; - const double apb = alpha + beta; + INTREPID2_TEST_FOR_ABORT(polyd.data() 
&& !polyi.data() , + ">>> ERROR (Polylib::Serial::JacobiPolynomial): polyi view needed to compute polyd view."); + if(!polyi.data()) return; - typename polyiViewType::value_type - poly[MaxPolylibPoint]={}, polyn1[MaxPolylibPoint]={}, polyn2[MaxPolylibPoint]={}; + constexpr ordinal_type maxOrder = 2*MaxPolylibPoint-1; - if (polyi.data()) - for (ordinal_type i=0;i>> ERROR (Polylib::Serial::JacobiPolynomial): Requested order exceeds maxOrder ."); + + double a2[maxOrder-1]={}, a3[maxOrder-1]={}, a4[maxOrder-1]={}; + double ad1(0.0), ad2(0.0), ad3(0.0); + const double apb = alpha + beta; + const double amb = alpha - beta; - for (ordinal_type i = 0; i < np; ++i) { - polyn2[i] = one; - polyn1[i] = 0.5*(alpha - beta + (alpha + beta + two)*z(i)); - } for (auto k = 2; k <= n; ++k) { - a1 = two*k*(k + apb)*(two*k + apb - two); - a2 = (two*k + apb - one)*(alpha*alpha - beta*beta); - a3 = (two*k + apb - two)*(two*k + apb - one)*(two*k + apb); - a4 = two*(k + alpha - one)*(k + beta - one)*(two*k + apb); - - a2 /= a1; - a3 /= a1; - a4 /= a1; - - for (ordinal_type i = 0; i < np; ++i) { - poly [i] = (a2 + a3*z(i))*polyn1[i] - a4*polyn2[i]; - polyn2[i] = polyn1[i]; - polyn1[i] = poly [i]; - } + double a1 = two*k*(k + apb)*(two*k + apb - two); + a2[k-2] = (two*k + apb - one)*(apb*amb)/a1; + a3[k-2] = (two*k + apb - two)*(two*k + apb - one)*(two*k + apb)/a1; + a4[k-2] = two*(k + alpha - one)*(k + beta - one)*(two*k + apb)/a1; } if (polyd.data()) { - a1 = n*(alpha - beta); - a2 = n*(two*n + alpha + beta); - a3 = two*(n + alpha)*(n + beta); - a4 = (two*n + alpha + beta); - a1 /= a4; - a2 /= a4; - a3 /= a4; - - // note polyn2 points to polyn1 at end of poly iterations - for (ordinal_type i = 0; i < np; ++i) { - polyd(i) = (a1- a2*z(i))*poly[i] + a3*polyn2[i]; - polyd(i) /= (one - z(i)*z(i)); - } + double ad4 = (two*n + alpha + beta); + ad1 = n*(alpha - beta)/ad4; + ad2 = n*(two*n + alpha + beta)/ad4; + ad3 = two*(n + alpha)*(n + beta)/ad4; } - if (polyi.data()) - for (ordinal_type
i=0;i::value && std::is_trivial::value) ? 0 : get_dimension_scalar(view); } + + /// Struct for deleting device instantiation + template + struct DeviceDeleter { + template + void operator()(T* ptr) { + Kokkos::parallel_for(Kokkos::RangePolicy(0,1), + KOKKOS_LAMBDA (const int i) { ptr->~T(); }); + typename Device::execution_space().fence(); + Kokkos::kokkos_free(ptr); + } + }; + + /// Function for creating a vtable on device (requires copy ctor for + /// derived object). Allocates device memory and must be called from + /// host. + template + std::unique_ptr> + copy_virtual_class_to_device(const Derived& host_source) + { + auto* p = static_cast(Kokkos::kokkos_malloc(sizeof(Derived))); + Kokkos::parallel_for(Kokkos::RangePolicy(0,1), + KOKKOS_LAMBDA (const int i) {new (p) Derived(host_source); }); + typename Device::execution_space().fence(); + return std::unique_ptr>(p); + } } // end namespace Intrepid2 #endif diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/CMakeLists.txt index f080139e2292..f0c9e31cd911 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ 
-68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS 
Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_HEX_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..cd4d6dabdf43 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HCURL_HEX_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...)
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_HEX_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..7f4e2807360d --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_I1_FEM/test_02.hpp @@ -0,0 +1,187 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_HEX_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_HEX_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HCURL_HEX_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_HEX_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HCURL_HEX_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = 
getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0)<< ", " << outputCurlsA_Host(ic,i,j,1) << ", " << outputCurlsA_Host(ic,i,j,2) << "]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1)<< ", " << outputCurlsB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/CMakeLists.txt index b682181c9d5b..2e5f6844ed27 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -83,3 +86,75 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + 
LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_HEX_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..41f15e65574e --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_HEX_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_HEX_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/test_02.hpp new file mode 100644 index 000000000000..e9ecf8ca65a4 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_HEX_In_FEM/test_02.hpp @@ -0,0 +1,203 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_HEX_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_HEX_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HCURL_HEX_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_HEX_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HCURL_HEX_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = 
getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + // avoid using a team size larger than needed, to reduce allocated scrach space memory + ordinal_type team_size = teamPolicy.team_size_recommended(functor, Kokkos::ParallelForTag()); + *outStream << "Max Recommended team size: " << team_size << ", Requested team size: " << npts <(ncells, team_size,vectorSize); + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + // avoid using a team size larger than needed, to reduce allocated scrach space memory + ordinal_type team_size = teamPolicy.team_size_recommended(functor, Kokkos::ParallelForTag()); + *outStream << "Max Recommended team size: " << team_size << ", Requested team size: " << npts <(ncells, team_size,vectorSize); + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0)<< ", " << outputCurlsA_Host(ic,i,j,1) << ", " << outputCurlsA_Host(ic,i,j,2) << "]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1)<< ", " << outputCurlsB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/CMakeLists.txt index 716000daf9b3..89117d0742fb 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,75 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + 
LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_QUAD_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..01d5359f6b02 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_QUAD_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_QUAD_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..ecef9a26d9d4 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_QUAD_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_QUAD_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HCURL_QUAD_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_QUAD_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HCURL_QUAD_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto 
functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j)<<"]" + << ", curls B: [" << outputCurlsB_Host(i,j) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/CMakeLists.txt index c831e83ec896..2f44c158238f 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HCURL_QUAD_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_QUAD_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HCURL_QUAD_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..daa3176be226 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_QUAD_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_QUAD_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/test_02.hpp new file mode 100644 index 000000000000..2ae8438a11ac --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_QUAD_In_FEM/test_02.hpp @@ -0,0 +1,189 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_QUAD_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_QUAD_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HCURL_QUAD_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_QUAD_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HCURL_QUAD_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j)<<"]" + << ", curls B: [" << outputCurlsB_Host(i,j) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/CMakeLists.txt index 46e84774b70d..234b8e2d6fed 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND 
Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_TET_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/eti/test_02_ETI.in 
b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..8fadc4a2c865 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_TET_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_TET_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..9c112664aff0 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_I1_FEM/test_02.hpp @@ -0,0 +1,187 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_TET_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_TET_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HCURL_TET_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_TET_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HCURL_TET_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute 
values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0)<< ", " << outputCurlsA_Host(ic,i,j,1) << ", " << outputCurlsA_Host(ic,i,j,2) << "]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1)<< ", " << outputCurlsB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/CMakeLists.txt index 46e4453c0d57..c40f3503ccf9 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HCURL_TET_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_TET_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HCURL_TET_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..278f9326b54c --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_TET_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_TET_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/test_02.hpp new file mode 100644 index 000000000000..d51a4ed29ae1 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TET_In_FEM/test_02.hpp @@ -0,0 +1,205 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_TET_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_TET_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HCURL_TET_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_TET_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 7; + + try { + for (int order=1;order <= maxOrder;++order) { + using BasisType = Basis_HCURL_TET_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + // avoid using a team size larger than needed, to reduce allocated scrach space memory + ordinal_type team_size = teamPolicy.team_size_recommended(functor, Kokkos::ParallelForTag()); + *outStream << "Max Recommended team size: " << team_size << ", Requested team size: " << npts <(ncells, team_size,vectorSize); + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + // avoid using a team size larger than needed, to reduce allocated scrach space memory + ordinal_type team_size = teamPolicy.team_size_recommended(functor, Kokkos::ParallelForTag()); + *outStream << "Max Recommended team size: " << team_size << ", Requested team size: " << npts <(ncells, team_size,vectorSize); + + //Get the required size of the 
scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + //Note, the PR intel 2021 serial build shows substantially higher errors (possibly due to operation rearrangements). 
+ auto tol = 1.0e6*epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0)<< ", " << outputCurlsA_Host(ic,i,j,1) << ", " << outputCurlsA_Host(ic,i,j,2) << "]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1)<< ", " << outputCurlsB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/CMakeLists.txt index 2bf7bfdee691..b87adda0a338 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE 
"") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,75 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + 
FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_TRI_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..2fe8396db2d1 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_TRI_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_TRI_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..3b255303dd1f --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_TRI_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_TRI_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HCURL_TRI_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_TRI_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HCURL_TRI_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); 
+ Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j)<<"]" + << ", curls B: [" << outputCurlsB_Host(i,j) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/CMakeLists.txt index 49f5b786efc9..ec30d2154004 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HCURL_TRI_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_TRI_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HCURL_TRI_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..509fff60809f --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_TRI_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_TRI_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/test_02.hpp new file mode 100644 index 000000000000..aa19ce2114e0 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_TRI_In_FEM/test_02.hpp @@ -0,0 +1,189 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_TRI_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_TRI_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HCURL_TRI_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_TRI_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HCURL_TRI_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j)<<"]" + << ", curls B: [" << outputCurlsB_Host(i,j) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/CMakeLists.txt index ba0496748a48..cb1ebf7b3de1 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + 
LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HCURL_WEDGE_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..1b963155651c --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HCURL_WEDGE_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HCURL_WEDGE_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..de75f4cf2d72 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HCURL_WEDGE_I1_FEM/test_02.hpp @@ -0,0 +1,188 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HCURL_WEDGE_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HCURL_WEDGE_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HCURL_WEDGE_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HCURL_WEDGE_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HCURL_WEDGE_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and curls for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { 
//compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and curls for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and curls on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0)<< ", " << outputCurlsA_Host(ic,i,j,1) << ", " << outputCurlsA_Host(ic,i,j,2) << "]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1)<< ", " << outputCurlsB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/CMakeLists.txt index fd4d688a591d..3fb8fc747f9c 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/CMakeLists.txt @@ -1,13 +1,18 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") - + IF (HAVE_INTREPID2_SACADO) # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DOUBLE") # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND 
Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_HEX_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + 
PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..ab24cfec247d --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HDIV_HEX_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...)
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_HEX_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..fb05ad186945 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_I1_FEM/test_02.hpp @@ -0,0 +1,186 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_HEX_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_HEX_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_HEX_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_HEX_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HDIV_HEX_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { 
//compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/CMakeLists.txt index a0e677500751..7a81181c8403 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HDIV_HEX_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_HEX_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HDIV_HEX_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..71b715c78833 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HDIV_HEX_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...)
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_HEX_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/test_02.hpp new file mode 100644 index 000000000000..61c3d844f5dd --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_HEX_In_FEM/test_02.hpp @@ -0,0 +1,190 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_HEX_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_HEX_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ + template + int HDIV_HEX_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_HEX_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HDIV_HEX_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Order: " << order << ": Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/CMakeLists.txt index 5900fa72e32a..b21760f88ec4 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/CMakeLists.txt @@ -1,13 +1,18 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") - + IF (HAVE_INTREPID2_SACADO) # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DOUBLE") # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") 
+LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_QUAD_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + 
PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..5e1eb4c2dc79 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_QUAD_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_QUAD_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..a811df7230c7 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_QUAD_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_QUAD_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_QUAD_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_QUAD_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HDIV_QUAD_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { 
//compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/CMakeLists.txt index 59d15e7c716c..cdc2989d6036 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HDIV_QUAD_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_QUAD_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HDIV_QUAD_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..328d40fda920 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_QUAD_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_QUAD_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/test_02.hpp new file mode 100644 index 000000000000..529007a97787 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_QUAD_In_FEM/test_02.hpp @@ -0,0 +1,190 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_QUAD_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_QUAD_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HDIV_QUAD_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_QUAD_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HDIV_QUAD_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Order: " << order << ": Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/CMakeLists.txt index ea49b4cde715..6b8c89a459f9 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/CMakeLists.txt @@ -1,13 +1,18 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") - + IF (HAVE_INTREPID2_SACADO) # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DOUBLE") # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND 
Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_TET_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + 
PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..72f12aed7a9b --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_TET_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_TET_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..2da333826892 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_TET_I1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_TET_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_TET_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_TET_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HDIV_TET_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute 
values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/CMakeLists.txt index 264ce7c056ff..f4a2093e0e4f 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HDIV_TET_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_TET_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HDIV_TET_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..c08e06044acf --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_TET_In_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_TET_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/test_02.hpp new file mode 100644 index 000000000000..1d5f9059327c --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TET_In_FEM/test_02.hpp @@ -0,0 +1,190 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_TET_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_TET_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HDIV_TET_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_TET_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 7; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HDIV_TET_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Order: " << order << ": Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + //Note, the PR intel 2021 serial build shows substantially higher errors (possibly due to operation rearrangements). 
+ auto tol = 1e6*epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/CMakeLists.txt index 4f47ee20c141..581f594311e0 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/CMakeLists.txt @@ -1,13 +1,18 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") - + IF (HAVE_INTREPID2_SACADO) # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DOUBLE") # LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device 
SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,76 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} 
ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_TRI_I1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..99b3fb273163 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_TRI_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_TRI_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..15b4152a781a --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_TRI_I1_FEM class. 
+ \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_TRI_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_TRI_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_TRI_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HDIV_TRI_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View 
inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto 
outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/CMakeLists.txt index 4f11a0b1e70c..f06b5f1bb859 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND 
Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HDIV_TRI_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_TRI_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO 
"0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HDIV_TRI_In_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..060c322dc641 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_TRI_In_FEM team-level getValues. 
+ \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_TRI_In_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/test_02.hpp new file mode 100644 index 000000000000..1d3c940090d7 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_TRI_In_FEM/test_02.hpp @@ -0,0 +1,189 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. 
+// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_TRI_In_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_TRI_In_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_TRI_In_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_TRI_In_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HDIV_TRI_In_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, 
basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Order: " << order << ": Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); 
Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/CMakeLists.txt index 67fcf71311f7..d801c634869e 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE 
"test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,3 +71,75 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET 
Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HDIV_WEDGE_In_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..16906a746c00 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HDIV_WEDGE_I1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HDIV_WEDGE_I1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/test_02.hpp new file mode 100644 index 000000000000..c991769a4852 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HDIV_WEDGE_I1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HDIV_WEDGE_I1_FEM class. 
+ \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HDIV_WEDGE_I1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HDIV_WEDGE_I1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HDIV_WEDGE_I1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HDIV_WEDGE_I1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputDivergencesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View 
inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and divergences for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute divergences + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto divergencesACell = Kokkos::subview(outputDivergencesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(divergencesACell, inputPoints, OPERATOR_DIV, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_DIV); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and divergences for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputDivergencesB, inputPoints, OPERATOR_DIV); + + *outStream << "Comparing values and divergences on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: [" << outputValuesA_Host(ic,i,j,0) << ", " << outputValuesA_Host(ic,i,j,1) << ", " << outputValuesA_Host(ic,i,j,2) << "]" + << ", val B: [" << outputValuesB_Host(i,j,0) << ", " << outputValuesB_Host(i,j,1) << ", " << outputValuesB_Host(i,j,2) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare divergences + const auto outputDivergencesA_Host = Kokkos::create_mirror_view(outputDivergencesA); 
Kokkos::deep_copy(outputDivergencesA_Host, outputDivergencesA); + const auto outputDivergencesB_Host = Kokkos::create_mirror_view(outputDivergencesB); Kokkos::deep_copy(outputDivergencesB_Host, outputDivergencesB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", divergence A: " << outputDivergencesA_Host(ic,i,j) + << ", divergence B: " << outputDivergencesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/CMakeLists.txt index 88da0999c2ab..4982bd4f8dff 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/CMakeLists.txt @@ -3,6 +3,7 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -67,6 +68,7 @@ SET(Intrepid2_TEST_ETI_FILE "test_01") SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") + IF(Kokkos_ENABLE_CUDA) LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") @@ -106,3 
+108,75 @@ IF (${ETI_DEVICE_COUNT} GREATER_EQUAL 0) ENDFOREACH() ENDFOREACH() ENDIF() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_03") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_HEX_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} 
ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_HEX_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/eti/test_03_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/eti/test_03_ETI.in new file mode 100644 index 000000000000..a88bf31183c7 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/eti/test_03_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_03.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_HEX_C1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_03.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_HEX_C1_FEM_Test03<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/test_03.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/test_03.hpp new file mode 100644 index 000000000000..9d326a80cf33 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C1_FEM/test_03.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_03.hpp + \brief Unit tests for the Intrepid2::HGRAD_HEX_C1_FEM class. 
+ \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_HEX_C1_FEM_Test03(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_HEX_C1_FEM, Test 3", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_HEX_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", 
npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/CMakeLists.txt index a9bfc7f38abb..29e3244386c9 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/CMakeLists.txt @@ -1,8 +1,16 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "") +LIST(APPEND Intrepid2_TEST_ETI_FILE + "test_01" + "test_01_Serendipity") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,12 +25,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "") -LIST(APPEND Intrepid2_TEST_ETI_FILE - "test_01" - "test_01_Serendipity") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") 
+ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_HEX_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_HEX_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") 
SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..f10b05aa223e --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_HEX_C2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_HEX_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..a29875462280 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_C2_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_HEX_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_HEX_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_HEX_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_HEX_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto 
functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/CMakeLists.txt index 793d773f707f..ba86fece89df 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/CMakeLists.txt @@ -7,6 +7,7 @@ SET(Intrepid2_TEST_ETI_FILE "test_01") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -94,11 +95,18 @@ SET(Intrepid2_TEST_ETI_FILE "test_02") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_01_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_01_ETI.in index 
487708632660..25426631d6de 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_01_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_01_ETI.in @@ -28,9 +28,8 @@ constexpr int num_deriv = 10; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 23) -constexpr int num_deriv = 9; -constexpr int max_deriv = 10; -#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, max_deriv+1) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 20) constexpr int num_deriv = 2; diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_02_ETI.in index 6a200d58b21d..d314677fd1db 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_02_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/eti/test_02_ETI.in @@ -7,22 +7,45 @@ // ***************************************************************************** // @HEADER -/** \file test_01.cpp - \brief Unit test of serial interface Intrepid2::Basis_HGRAD_QUAD_Cn_FEM. +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_HEX_Cn_FEM team-level getValues. \author Kyungjoo Kim */ #include "Kokkos_Core.hpp" +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + #include "test_02.hpp" int main(int argc, char *argv[]) { + const bool verbose = (argc-1) > 0; Kokkos::initialize(); - { - const bool verbose = (argc-1) > 0; - Intrepid2::Test::HGRAD_HEX_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); - } + + Intrepid2::Test::HGRAD_HEX_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + Kokkos::finalize(); return 0; } diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/test_02.hpp index b98955113bde..e392f1540447 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/test_02.hpp +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_HEX_Cn_FEM/test_02.hpp @@ -7,9 +7,9 @@ // ***************************************************************************** // @HEADER -/** \file test_01.hpp +/** \file test_02.hpp \brief Unit tests for the Intrepid2::HGRAD_HEX_Cn_FEM class. - \author Created by P. Bochev, D. Ridzal, K. 
Peterson, Kyungjoo Kim + \author Created by Kyungjoo Kim, Mauro Perego */ @@ -23,100 +23,151 @@ #include "Intrepid2_Utils.hpp" #include "Intrepid2_HGRAD_HEX_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" namespace Intrepid2 { namespace Test { - // This code provides an example to use serial interface of high order elements + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. template int HGRAD_HEX_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_HEX_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + using DeviceSpaceType = typename DeviceType::execution_space; Kokkos::print_configuration(std::cout, false); int errorFlag = 0; - + constexpr int maxOrder = 9; try { - // for higher orders in certain environments, this test can take a while to run in ctest. So we reduce the number of points as we go beyond 10th order. Also, @mperego is replacing this test, so for now we just restrict to the 10 orders we supported until recently. - for (int order=1;order<10;++order) { - Basis_HGRAD_HEX_Cn_FEM basis(order); + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HGRAD_HEX_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); - // problem setup - // let's say we want to evaluate 1000 points in parallel. output values are stored in outputValuesA and B. - // A is compuated via serial interface and B is computed with top-level interface. 
- const int npts = 1000, ndim = 3; - Kokkos::DynRankView outputValuesA("outputValuesA", basis.getCardinality(), npts); - Kokkos::DynRankView outputValuesB("outputValuesB", basis.getCardinality(), npts); + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; - Kokkos::View inputPointsViewToUseRandom("inputPoints", npts, ndim); - Kokkos::DynRankView inputPoints (inputPointsViewToUseRandom.data(), npts, ndim); + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); - // random values between (-1,1) x (-1,1) + // random values between (0,1) Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); - // compute setup - // we need vinv and workspace - const auto vinv = basis.getVandermondeInverse(); - - // worksize - // workspace per thread is required for serial interface. - // parallel_for with range policy would be good to use stack workspace - // as team policy only can create shared memory - // this part would be tricky as the max size should be determined at compile time - // let's think about this and find out the best practice. for now I use the following. 
- constexpr int worksize = (Parameters::MaxOrder+1)*4; - - // if you use team policy, worksize can be gathered from the basis object and use - // kokkos shmem_size APIs to create workspace per team or per thread. - //const auto worksize_for_teampolicy = basis.getWorksizePerPoint(OPERATOR_VALUE); - - // extract point range to be evaluated in each thread - typedef Kokkos::pair range_type; - - // parallel execution with serial interface - Kokkos::RangePolicy policy(0, npts); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i) { - // we evaluate a single point - const range_type pointRange = range_type(i,i+1); - - // out (# dofs, # pts), input (# pts, # dims) - auto output = Kokkos::subview(outputValuesA, Kokkos::ALL(), pointRange); - auto input = Kokkos::subview(inputPoints, pointRange, Kokkos::ALL()); + + *outStream << "Order: " << order << ": Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; - // wrap static workspace with a view; serial interface has a template view interface. - // either view or dynrankview with a right size is okay. - OutValueType workbuf[worksize]; - Kokkos::View work(&workbuf[0], worksize); + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; - // evaluate basis using serial interface - Impl::Basis_HGRAD_HEX_Cn_FEM - ::Serial::getValues(output, input, work, vinv); - }); - - // evaluation using high level interface - basis.getValues(outputValuesB, inputPoints, OPERATOR_VALUE); - - // compare - const auto outputValuesA_Host = Kokkos::create_mirror_view(outputValuesA); Kokkos::deep_copy(outputValuesA_Host, outputValuesA); - const auto outputValuesB_Host = Kokkos::create_mirror_view(outputValuesB); Kokkos::deep_copy(outputValuesB_Host, outputValuesB); - - double sum = 0, diff = 0; - for (size_t i=0;i 1.0e-9)) { - std::cout << " order = " << order - << " i = " << i << " j = " << j - << " val A = " << outputValuesA_Host(i,j) - << " val B = " << outputValuesB_Host(i,j) - << " diff = " << (outputValuesA_Host(i,j) - outputValuesB_Host(i,j)) - << std::endl; - } + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); } - if (diff/sum > 1.0e-9) { - errorFlag = -1; + } + + *outStream << "Order: " << order << ": Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Order: " << order << ": Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } } } } catch (std::exception &err) { diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/CMakeLists.txt 
b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/CMakeLists.txt index 940d4ad3ebb4..186c0369d09e 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + # MESSAGE(STATUS "Generating TEST HGRAD_LINE_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME 
${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_LINE_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..69bb74d6a746 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. 
+// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_LINE_C1_FEM team-level get Values. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_LINE_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..c0d1db740ce9 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C1_FEM/test_02.hpp @@ -0,0 +1,185 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_LINE_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_LINE_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + template + int HGRAD_LINE_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_LINE_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_LINE_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + // problem setup + // let's say we want to evaluate 1000 points in parallel. output values are stored in outputValuesA and B. + // A is compuated via serial interface and B is computed with top-level interface. + const int ncells = 5, npts = 10, ndim = 1; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/CMakeLists.txt index f26e93eb35f6..47fc663ffd89 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE 
"Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_LINE_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_LINE_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..76e7d225aa79 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_LINE_C2_FEM team-level get Values. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_LINE_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..7c40e6e00dd1 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_C2_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_LINE_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_LINE_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_LINE_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_LINE_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_LINE_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 1; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + 
auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/CMakeLists.txt index 40be3eb7ddf0..088e2285ac8f 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_LINE_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_LINE_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_01_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_01_ETI.in index 070fba1f3916..b662965e7493 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_01_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_01_ETI.in @@ -28,10 +28,8 @@ constexpr int num_deriv = 10; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 23) -/// Mauro, the master branch uses this derivative dimension which sounds dummy -constexpr int num_deriv = 0; //9; -constexpr int max_deriv = 1; //10; -#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, max_deriv+1) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 20) constexpr int num_deriv = 2; diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..2ff629694b47 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_LINE_Cn_FEM team-level get Values. 
+ \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_LINE_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..23dafa935f7e --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_LINE_Cn_FEM/test_02.hpp @@ -0,0 +1,188 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. 
+// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_LINE_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_LINE_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function.s + template + int HGRAD_LINE_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_LINE_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + for (int order=1;order<=Parameters::MaxOrder;++order) { + using BasisType = Basis_HGRAD_LINE_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 1; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, 
ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Order: " << order << ": Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); 
Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/CMakeLists.txt index aac1913c1e91..60b3eaa1ed20 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME 
"Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_PYR_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_PYR_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND 
Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..ac8e1393df20 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_PYR_C1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_PYR_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..ae1ba8b9b47d --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_C1_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_PYR_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_PYR_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HGRAD_PYR_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_PYR_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_PYR_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = 
getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/CMakeLists.txt index fdbf58124c2e..813910ef9f3d 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,7 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") - +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -68,6 +71,79 @@ FOREACH(I RANGE ${ETI_DEVICE_COUNT}) ENDFOREACH() ENDFOREACH() + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND 
Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TET_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TET_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + 
TRIBITS_COPY_FILES_TO_BINARY_DIR(HGRAD_PYR_I2TestDataCopy SOURCE_FILES PYR_I2_D2Vals.dat diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..b1bc686c303d --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_PYR_I2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_PYR_I2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/test_02.hpp new file mode 100644 index 000000000000..39b7903b384a --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_PYR_I2_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_PYR_I2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_PYR_I2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_PYR_I2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_PYR_I2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_PYR_I2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto 
functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/CMakeLists.txt index ef324d6c681f..593042946b82 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE 
"Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_QUAD_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_QUAD_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..7650cb60968c --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of serial interface Intrepid2::Basis_HGRAD_QUAD_C1_FEM. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_QUAD_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..2dba47d6f022 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C1_FEM/test_02.hpp @@ -0,0 +1,228 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_QUAD_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_QUAD_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_QUAD_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_QUAD_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device 
= basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, 
outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/CMakeLists.txt index 6d92bb337ac4..14d863a19fc7 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/CMakeLists.txt @@ -1,8 +1,16 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "") +LIST(APPEND Intrepid2_TEST_ETI_FILE + "test_01" + "test_01_Serendipity") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,12 +25,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "") -LIST(APPEND Intrepid2_TEST_ETI_FILE - "test_01" - "test_01_Serendipity") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") 
+ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_QUAD_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_QUAD_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") 
SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..3dac2095b0b6 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_QUAD_C2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_QUAD_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..cf2ba0043d7b --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_C2_FEM/test_02.hpp @@ -0,0 +1,228 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_QUAD_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + namespace Test { + + template + int HGRAD_QUAD_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_QUAD_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_QUAD_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device 
= basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, 
outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/CMakeLists.txt index 9a6190ea4405..50f38bf22177 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/CMakeLists.txt @@ -7,6 +7,7 @@ SET(Intrepid2_TEST_ETI_FILE "test_01") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -94,11 +95,18 @@ SET(Intrepid2_TEST_ETI_FILE "test_02") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_01_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_01_ETI.in index 
46bd4b13ccfb..0cae06ee9e31 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_01_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_01_ETI.in @@ -28,9 +28,8 @@ constexpr int num_deriv = 10; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 23) -constexpr int num_deriv = 9; -constexpr int max_deriv = 10; -#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, max_deriv+1) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 20) constexpr int num_deriv = 2; diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_02_ETI.in index ccb60ba60798..07adf2c5e888 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_02_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/eti/test_02_ETI.in @@ -8,21 +8,44 @@ // @HEADER /** \file test_01.cpp - \brief Unit test of serial interface Intrepid2::Basis_HGRAD_QUAD_Cn_FEM. + \brief Unit test of Intrepid2::Basis_HGRAD_QUAD_Cn_FEM team-level getValues. \author Kyungjoo Kim */ #include "Kokkos_Core.hpp" +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + #include "test_02.hpp" int main(int argc, char *argv[]) { + const bool verbose = (argc-1) > 0; Kokkos::initialize(); - { - const bool verbose = (argc-1) > 0; - Intrepid2::Test::HGRAD_QUAD_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); - } + + Intrepid2::Test::HGRAD_QUAD_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + Kokkos::finalize(); return 0; } diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/test_02.hpp index 36a858dec901..2e2bdf715470 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/test_02.hpp +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_QUAD_Cn_FEM/test_02.hpp @@ -7,9 +7,9 @@ // ***************************************************************************** // @HEADER -/** \file test_01.hpp +/** \file test_02.hpp \brief Unit tests for the Intrepid2::HGRAD_QUAD_Cn_FEM class. - \author Created by P. Bochev, D. Ridzal, K. 
Peterson, Kyungjoo Kim + \author Created by Kyungjoo Kim, Mauro Perego */ @@ -23,98 +23,196 @@ #include "Intrepid2_Utils.hpp" #include "Intrepid2_HGRAD_QUAD_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" namespace Intrepid2 { namespace Test { - // This code provides an example to use serial interface of high order elements + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. template int HGRAD_QUAD_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_QUAD_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; Kokkos::print_configuration(std::cout, false); int errorFlag = 0; - + constexpr int maxOrder = 9; try { - for (int order=1;order basis(order); + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HGRAD_QUAD_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); - // problem setup - // let's say we want to evaluate 1000 points in parallel. output values are stored in outputValuesA and B. - // A is compuated via serial interface and B is computed with top-level interface. 
- const int npts = 1000, ndim = 2; - Kokkos::DynRankView outputValuesA("outputValuesA", basis.getCardinality(), npts); - Kokkos::DynRankView outputValuesB("outputValuesB", basis.getCardinality(), npts); + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); - Kokkos::View inputPointsViewToUseRandom("inputPoints", npts, ndim); - Kokkos::DynRankView inputPoints (inputPointsViewToUseRandom.data(), npts, ndim); + using ScalarType = typename ScalarTraits::scalar_type; - // random values between (-1,1) x (-1,1) + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); - // compute setup - // we need vinv and workspace - const auto vinv = basis.getVandermondeInverse(); - - // worksize - // workspace per thread is required for serial interface. 
- // parallel_for with range policy would be good to use stack workspace - // as team policy only can create shared memory - // this part would be tricky as the max size should be determined at compile time - // let's think about this and find out the best practice. for now I use the following. - constexpr int worksize = (Parameters::MaxOrder+1)*3; - - // if you use team policy, worksize can be gathered from the basis object and use - // kokkos shmem_size APIs to create workspace per team or per thread. - //const auto worksize_for_teampolicy = basis.getWorksizePerPoint(OPERATOR_VALUE); - - // extract point range to be evaluated in each thread - typedef Kokkos::pair range_type; - - // parallel execution with serial interface - Kokkos::RangePolicy policy(0, npts); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i) { - // we evaluate a single point - const range_type pointRange = range_type(i,i+1); + + *outStream << "Order: " << order << ": Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; - // out (# dofs, # pts), input (# pts, # dims) - auto output = Kokkos::subview(outputValuesA, Kokkos::ALL(), pointRange); - auto input = Kokkos::subview(inputPoints, pointRange, Kokkos::ALL()); + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; - // wrap static workspace with a view; serial interface has a template view interface. - // either view or dynrankview with a right size is okay. - OutValueType workbuf[worksize]; - Kokkos::View work(&workbuf[0], worksize); + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; - // evaluate basis using serial interface - Impl::Basis_HGRAD_QUAD_Cn_FEM - ::Serial::getValues(output, input, work, vinv); - }); - - // evaluation using high level interface - basis.getValues(outputValuesB, inputPoints, OPERATOR_VALUE); - - // compare - const auto outputValuesA_Host = 
Kokkos::create_mirror_view(outputValuesA); Kokkos::deep_copy(outputValuesA_Host, outputValuesA); - const auto outputValuesB_Host = Kokkos::create_mirror_view(outputValuesB); Kokkos::deep_copy(outputValuesB_Host, outputValuesB); - - double sum = 0, diff = 0; - for (size_t i=0;igetValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << 
ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } } } } catch (std::exception &err) { diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/CMakeLists.txt index 37135caa841f..325ab37afd81 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TET_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TET_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/eti/test_02_ETI.in 
b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..7fab72655932 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_TET_C1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_TET_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..48d5b3eb9e2f --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C1_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_TET_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_TET_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_TET_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TET_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_TET_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto 
functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/CMakeLists.txt index 2d0041218982..cac75a8505b4 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE 
"Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TET_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TET_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..b01c59418753 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_TET_C2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_TET_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..5b788cc85328 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_C2_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_TET_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_TET_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + namespace Test { + + template + int HGRAD_TET_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TET_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_TET_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto 
functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/CMakeLists.txt index 09fddb77ac02..3bc181264b2b 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND 
Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TET_COMP12_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TET_COMP12_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..c7a155d2f8a9 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_TET_COMP12_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_TET_COMP12_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/test_02.hpp new file mode 100644 index 000000000000..9120a9bf8b53 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_COMP12_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_TET_COMP12_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_TET_COMP12_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_TET_COMP12_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TET_COMP12_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_TET_COMP12_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute 
values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/CMakeLists.txt index cb2c34d9f2e6..b669b1b2ba1f 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/CMakeLists.txt @@ -7,6 +7,7 @@ SET(Intrepid2_TEST_ETI_FILE "test_01") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -94,11 +95,18 @@ SET(Intrepid2_TEST_ETI_FILE "test_02") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_01_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_01_ETI.in index 
74f1bccc00db..c997523b3120 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_01_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_01_ETI.in @@ -28,10 +28,8 @@ constexpr int num_deriv = 10; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 23) -/// Mauro, the master branch uses this derivative dimension which sounds dummy -constexpr int num_deriv = 0; //9; -constexpr int max_deriv = 1; //10; -#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, max_deriv+1) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 20) constexpr int num_deriv = 2; diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_02_ETI.in index c26586d323cf..a6b3263eb7c0 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_02_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/eti/test_02_ETI.in @@ -8,21 +8,44 @@ // @HEADER /** \file test_01.cpp - \brief Unit test of serial interface Intrepid2::Basis_HGRAD_TET_Cn_FEM. + \brief Unit test of Intrepid2::Basis_HGRAD_TET_Cn_FEM team-level getValues. \author Kyungjoo Kim */ #include "Kokkos_Core.hpp" +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + #include "test_02.hpp" int main(int argc, char *argv[]) { + const bool verbose = (argc-1) > 0; Kokkos::initialize(); - { - const bool verbose = (argc-1) > 0; - Intrepid2::Test::HGRAD_TET_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); - } + + Intrepid2::Test::HGRAD_TET_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + Kokkos::finalize(); return 0; } diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/test_02.hpp index 4f6c6c3a33e0..711fd6d35bdb 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/test_02.hpp +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TET_Cn_FEM/test_02.hpp @@ -7,9 +7,9 @@ // ***************************************************************************** // @HEADER -/** \file test_01.hpp +/** \file test_02.hpp \brief Unit tests for the Intrepid2::HGRAD_TET_Cn_FEM class. - \author Created by P. Bochev, D. Ridzal, K. 
Peterson, Kyungjoo Kim + \author Created by Kyungjoo Kim, Mauro Perego */ @@ -23,99 +23,153 @@ #include "Intrepid2_Utils.hpp" #include "Intrepid2_HGRAD_TET_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" namespace Intrepid2 { namespace Test { - // This code provides an example to use serial interface of high order elements + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. template int HGRAD_TET_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TET_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + using DeviceSpaceType = typename DeviceType::execution_space; Kokkos::print_configuration(std::cout, false); int errorFlag = 0; - + constexpr int maxOrder = 7; try { - for (int order=1;order<10;++order) { - Basis_HGRAD_TET_Cn_FEM basis(order); + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HGRAD_TET_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); - // problem setup - // let's say we want to evaluate 1000 points in parallel. output values are stored in outputValuesA and B. - // A is compuated via serial interface and B is computed with top-level interface. 
- const int npts = 1000, ndim = 3; - Kokkos::DynRankView outputValuesA("outputValuesA", basis.getCardinality(), npts); - Kokkos::DynRankView outputValuesB("outputValuesB", basis.getCardinality(), npts); + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; - Kokkos::View inputPointsViewToUseRandom("inputPoints", npts, ndim); - Kokkos::DynRankView inputPoints (inputPointsViewToUseRandom.data(), npts, ndim); + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); - // random values between (-1,1) x (-1,1) + // random values between (0,1) Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); - // compute setup - // we need vinv and workspace - const auto vinv = basis.getVandermondeInverse(); - - // worksize - // workspace per thread is required for serial interface. - // parallel_for with range policy would be good to use stack workspace - // as team policy only can create shared memory - // this part would be tricky as the max size should be determined at compile time - // let's think about this and find out the best practice. for now I use the following. 
- constexpr int worksize = (Parameters::MaxOrder+1)*(Parameters::MaxOrder+1)*(Parameters::MaxOrder+1); - - // if you use team policy, worksize can be gathered from the basis object and use - // kokkos shmem_size APIs to create workspace per team or per thread. - //const auto worksize_for_teampolicy = basis.getWorksizePerPoint(OPERATOR_VALUE); - - // extract point range to be evaluated in each thread - typedef Kokkos::pair range_type; - - // parallel execution with serial interface - Kokkos::RangePolicy policy(0, npts); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i) { - // we evaluate a single point - const range_type pointRange = range_type(i,i+1); - - // out (# dofs, # pts), input (# pts, # dims) - auto output = Kokkos::subview(outputValuesA, Kokkos::ALL(), pointRange); - auto input = Kokkos::subview(inputPoints, pointRange, Kokkos::ALL()); + + *outStream << "Order: " << order << ": Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; - // wrap static workspace with a view; serial interface has a template view interface. - // either view or dynrankview with a right size is okay. - OutValueType workbuf[worksize]; - Kokkos::View work(&workbuf[0], worksize); + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; - // evaluate basis using serial interface - Impl::Basis_HGRAD_TET_Cn_FEM - ::Serial::getValues(output, input, work, vinv); - }); - - // evaluation using high level interface - basis.getValues(outputValuesB, inputPoints, OPERATOR_VALUE); - - // compare - const auto outputValuesA_Host = Kokkos::create_mirror_view(outputValuesA); Kokkos::deep_copy(outputValuesA_Host, outputValuesA); - const auto outputValuesB_Host = Kokkos::create_mirror_view(outputValuesB); Kokkos::deep_copy(outputValuesB_Host, outputValuesB); - - double sum = 0, diff = 0; - for (size_t i=0;igetValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Order: " << order << ": Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); 
Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + + //Note, the PR intel 2021 serial build shows substantially higher errors (possibly due to operation rearrangements). + auto tol = 1.0e6*epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } } } } catch (std::exception &err) { diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/CMakeLists.txt index eb726da6bb26..c5f307a89f52 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME 
"OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TRI_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TRI_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND 
Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..1a918203d7cc --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_TRI_C1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_TRI_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..928394ded0a4 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C1_FEM/test_02.hpp @@ -0,0 +1,228 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_TRI_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_TRI_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TRI_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_TRI_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, 
outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/CMakeLists.txt index 21c4f220d58a..ae831c937e39 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE 
"Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_TRI_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_TRI_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git 
a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..cd49ca800b02 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_TRI_C2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_TRI_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..60b8c49aa454 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_C2_FEM/test_02.hpp @@ -0,0 +1,228 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_01.hpp + \brief Unit tests for the Intrepid2::HGRAD_TRI_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_TRI_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TRI_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_TRI_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = 
basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_CURL); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, 
outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/CMakeLists.txt index 28b96612c334..4855e54c85a1 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/CMakeLists.txt @@ -7,6 +7,7 @@ SET(Intrepid2_TEST_ETI_FILE "test_01") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -94,11 +95,18 @@ SET(Intrepid2_TEST_ETI_FILE "test_02") # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_01_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_01_ETI.in index 
513cf7cb37bc..eaaead469fb6 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_01_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_01_ETI.in @@ -14,7 +14,6 @@ #include "Kokkos_Core.hpp" - #define ETI_SACADO @ETI_SACADO@ #if (ETI_SACADO != 0) /// SACADO #include "Kokkos_ViewFactory.hpp" @@ -29,16 +28,15 @@ constexpr int num_deriv = 10; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 23) -/// Mauro, the master branch uses this derivative dimension which sounds dummy -constexpr int num_deriv = 0; //9; -constexpr int max_deriv = 1; //10; -#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, max_deriv+1) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #elif (ETI_SACADO == 20) constexpr int num_deriv = 2; #define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) #define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) #endif + #include "test_01.hpp" int main(int argc, char *argv[]) { diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_02_ETI.in index c78997d6ea0c..a5343a485d3a 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_02_ETI.in +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/eti/test_02_ETI.in @@ -14,15 +14,38 @@ #include "Kokkos_Core.hpp" +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + #include "test_02.hpp" int main(int argc, char *argv[]) { + const bool verbose = (argc-1) > 0; Kokkos::initialize(); - { - const bool verbose = (argc-1) > 0; - Intrepid2::Test::HGRAD_TRI_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); - } + + Intrepid2::Test::HGRAD_TRI_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + Kokkos::finalize(); return 0; } diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/test_02.hpp index 80d75c9bf099..a82178b45f9a 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/test_02.hpp +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_TRI_Cn_FEM/test_02.hpp @@ -7,9 +7,9 @@ // ***************************************************************************** // @HEADER -/** \file test_01.hpp +/** \file test_02.hpp \brief Unit tests for the Intrepid2::HGRAD_TRI_Cn_FEM class. - \author Created by P. Bochev, D. Ridzal, K. 
Peterson, Kyungjoo Kim + \author Created by Kyungjoo Kim, Mauro Perego */ @@ -23,99 +23,196 @@ #include "Intrepid2_Utils.hpp" #include "Intrepid2_HGRAD_TRI_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" namespace Intrepid2 { namespace Test { - // This code provides an example to use serial interface of high order elements + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. template int HGRAD_TRI_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_TRI_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + using DeviceSpaceType = typename DeviceType::execution_space; Kokkos::print_configuration(std::cout, false); int errorFlag = 0; - + constexpr int maxOrder = 9; try { - for (int order=1;order basis(order); + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HGRAD_TRI_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); - // problem setup - // let's say we want to evaluate 1000 points in parallel. output values are stored in outputValuesA and B. - // A is compuated via serial interface and B is computed with top-level interface. 
- const int npts = 1000, ndim = 2; - Kokkos::DynRankView outputValuesA("outputValuesA", basis.getCardinality(), npts); - Kokkos::DynRankView outputValuesB("outputValuesB", basis.getCardinality(), npts); + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputCurlsB, basisPtr->getCardinality(), npts, ndim); + + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); - Kokkos::View inputPointsViewToUseRandom("inputPoints", npts, ndim); - Kokkos::DynRankView inputPoints (inputPointsViewToUseRandom.data(), npts, ndim); + using ScalarType = typename ScalarTraits::scalar_type; - // random values between (-1,1) x (-1,1) + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); - // compute setup - // we need vinv and workspace - const auto vinv = basis.getVandermondeInverse(); - - // worksize - // workspace per thread is required for serial interface. 
- // parallel_for with range policy would be good to use stack workspace - // as team policy only can create shared memory - // this part would be tricky as the max size should be determined at compile time - // let's think about this and find out the best practice. for now I use the following. - constexpr int worksize = (Parameters::MaxOrder+1)*(Parameters::MaxOrder+1); - - // if you use team policy, worksize can be gathered from the basis object and use - // kokkos shmem_size APIs to create workspace per team or per thread. - //const auto worksize_for_teampolicy = basis.getWorksizePerPoint(OPERATOR_VALUE); - - // extract point range to be evaluated in each thread - typedef Kokkos::pair range_type; - - // parallel execution with serial interface - Kokkos::RangePolicy policy(0, npts); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i) { - // we evaluate a single point - const range_type pointRange = range_type(i,i+1); + + *outStream << "Order: " << order << ": Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; - // out (# dofs, # pts), input (# pts, # dims) - auto output = Kokkos::subview(outputValuesA, Kokkos::ALL(), pointRange); - auto input = Kokkos::subview(inputPoints, pointRange, Kokkos::ALL()); + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; - // wrap static workspace with a view; serial interface has a template view interface. - // either view or dynrankview with a right size is okay. - OutValueType workbuf[worksize]; - Kokkos::View work(&workbuf[0], worksize); + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute curls + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto curlsACell = Kokkos::subview(outputCurlsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(curlsACell, inputPoints, OPERATOR_CURL, team_member, team_member.team_scratch(scratch_space_level)); + }; - // evaluate basis using serial interface - Impl::Basis_HGRAD_TRI_Cn_FEM - ::Serial::getValues(output, input, work, vinv); - }); - - // evaluation using high level interface - basis.getValues(outputValuesB, inputPoints, OPERATOR_VALUE); - - // compare - const auto outputValuesA_Host = 
Kokkos::create_mirror_view(outputValuesA); Kokkos::deep_copy(outputValuesA_Host, outputValuesA); - const auto outputValuesB_Host = Kokkos::create_mirror_view(outputValuesB); Kokkos::deep_copy(outputValuesB_Host, outputValuesB); - - double sum = 0, diff = 0; - for (size_t i=0;igetValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + basisPtr->getValues(outputCurlsB, inputPoints, OPERATOR_CURL); + + *outStream << "Order: " << order << ": Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << "]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare curls + const auto outputCurlsA_Host = Kokkos::create_mirror_view(outputCurlsA); Kokkos::deep_copy(outputCurlsA_Host, outputCurlsA); + const auto outputCurlsB_Host = Kokkos::create_mirror_view(outputCurlsB); Kokkos::deep_copy(outputCurlsB_Host, outputCurlsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << 
ic << ", i: " << i << ", j: " << j + << ", curls A: [" << outputCurlsA_Host(ic,i,j,0) << ", " << outputCurlsA_Host(ic,i,j,1) <<"]" + << ", curls B: [" << outputCurlsB_Host(i,j,0) << ", " << outputCurlsB_Host(i,j,1) << "]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } } } } catch (std::exception &err) { diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/CMakeLists.txt index a32463e45988..4b1a7626d4ff 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,9 +22,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "test_01") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_WEDGE_C1_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_WEDGE_C1_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/eti/test_02_ETI.in 
b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..759d2a05ad2a --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_WEDGE_C1_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_WEDGE_C1_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/test_02.hpp new file mode 100644 index 000000000000..33f9bb85137d --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C1_FEM/test_02.hpp @@ -0,0 +1,184 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_WEDGE_C1_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_WEDGE_C1_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. + template + int HGRAD_WEDGE_C1_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_WEDGE_C1_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_WEDGE_C1_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + 
auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/CMakeLists.txt index 759b1bc6bc22..f82d6a4f7fc4 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/CMakeLists.txt @@ -1,8 +1,16 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "") +LIST(APPEND Intrepid2_TEST_ETI_FILE + "test_01" + "test_01_Serendipity") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double") @@ -17,12 +25,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") -# Host test -SET(Intrepid2_TEST_ETI_FILE "") -LIST(APPEND Intrepid2_TEST_ETI_FILE - "test_01" - "test_01_Serendipity") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE 
"Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HGRAD_WEDGE_C2_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HGRAD_WEDGE_C2_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + +# test +SET(Intrepid2_TEST_ETI_FILE "test_02") + +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device 
SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -83,4 +159,3 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(HGRAD_WEDGE_C2TestDataCopy SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/testdata" DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/testdata" ) - diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..86de2ff60329 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HGRAD_WEDGE_C2_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HGRAD_WEDGE_C2_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/test_02.hpp new file mode 100644 index 000000000000..35f0e18c3dbe --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HGRAD_WEDGE_C2_FEM/test_02.hpp @@ -0,0 +1,182 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HGRAD_WEDGE_C2_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HGRAD_WEDGE_C2_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + template + int HGRAD_WEDGE_C2_FEM_Test02(const bool verbose) { + + //! Setup test output stream. 
+ Teuchos::RCP outStream = setup_output_stream( + verbose, "HGRAD_WEDGE_C2_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + try { + using BasisType = Basis_HGRAD_WEDGE_C2_FEM; + auto basisPtr = Teuchos::rcp(new BasisType()); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsA, ncells, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelOutView(outputGradsB, basisPtr->getCardinality(), npts, ndim); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Computing values and gradients for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + 
auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + + { //compute gradients + auto functor = KOKKOS_LAMBDA (typename Kokkos::TeamPolicy::member_type team_member) { + auto gradsACell = Kokkos::subview(outputGradsA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(gradsACell, inputPoints, OPERATOR_GRAD, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. 
+ int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_GRAD); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Computing values and gradients for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + basisPtr->getValues(outputGradsB, inputPoints, OPERATOR_GRAD); + + *outStream << "Comparing values and gradients on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + + { + // compare grads + const auto outputGradsA_Host = Kokkos::create_mirror_view(outputGradsA); Kokkos::deep_copy(outputGradsA_Host, outputGradsA); + const auto outputGradsB_Host = Kokkos::create_mirror_view(outputGradsB); Kokkos::deep_copy(outputGradsB_Host, outputGradsB); + + OutValueType diff = 0; + auto tol = epsilon(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", grads A: [" << outputGradsA_Host(ic,i,j,0) << ", " << outputGradsA_Host(ic,i,j,1) << ", " << outputGradsA_Host(ic,i,j,2) <<"]" + << ", grads B: [" << outputGradsB_Host(i,j,0) << ", " << outputGradsB_Host(i,j,1) << ", " << outputGradsB_Host(i,j,2) <<"]" + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/CMakeLists.txt index ebf2144cc03f..fa03caf7d02b 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HVOL_HEX_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HVOL_HEX_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HVOL_HEX_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..766fea986506 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HVOL_HEX_Cn_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HVOL_HEX_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..115a371bc82a --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_HEX_Cn_FEM/test_02.hpp @@ -0,0 +1,144 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HVOL_HEX_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HVOL_HEX_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HVOL_HEX_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HVOL_HEX_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HVOL_HEX_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA 
(typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + + *outStream << "Order: " << order << ": Comparing values on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/CMakeLists.txt index f5caa97dcaf1..24d663415a74 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HVOL_LINE_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HVOL_LINE_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HVOL_LINE_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..a6f42f8ba7b8 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HVOL_LINE_Cn_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HVOL_LINE_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..c71f401eda49 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_LINE_Cn_FEM/test_02.hpp @@ -0,0 +1,144 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HVOL_LINE_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HVOL_LINE_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HVOL_LINE_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HVOL_LINE_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HVOL_LINE_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 1; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA 
(typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + + *outStream << "Order: " << order << ": Comparing values on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/CMakeLists.txt index 769157316641..aedb132dcc82 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HVOL_QUAD_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HVOL_QUAD_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HVOL_QUAD_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..1c01cd896135 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HVOL_QUAD_Cn_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HVOL_QUAD_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..044f8fad53a0 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_QUAD_Cn_FEM/test_02.hpp @@ -0,0 +1,144 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HVOL_QUAD_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HVOL_QUAD_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HVOL_QUAD_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HVOL_QUAD_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HVOL_QUAD_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA 
(typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + + *outStream << "Order: " << order << ": Comparing values on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/CMakeLists.txt index 0f61f9a7cfff..7dfea7c9986c 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HVOL_TET_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HVOL_TET_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HVOL_TET_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..d15050227457 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HVOL_TET_Cn_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HVOL_TET_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..d27c666355c1 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TET_Cn_FEM/test_02.hpp @@ -0,0 +1,144 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HVOL_TET_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HVOL_TET_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HVOL_TET_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HVOL_TET_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HVOL_TET_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 3; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA 
(typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + + *outStream << "Order: " << order << ": Comparing values on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/CMakeLists.txt b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/CMakeLists.txt index 700140bb8337..49398919d584 100644 --- a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/CMakeLists.txt +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/CMakeLists.txt @@ -1,8 +1,13 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# test +SET(Intrepid2_TEST_ETI_FILE "test_01") + # value types SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") @@ -32,9 +37,80 @@ ENDIF() LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") +# device +SET(Intrepid2_TEST_ETI_DEVICE_NAME "") +SET(Intrepid2_TEST_ETI_DEVICE "") +IF(Kokkos_ENABLE_SERIAL) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "Serial") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "OpenMP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_CUDA) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "CUDA") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() +IF(Kokkos_ENABLE_HIP) + LIST(APPEND Intrepid2_TEST_ETI_DEVICE_NAME "HIP") + LIST(APPEND Intrepid2_TEST_ETI_DEVICE "Kokkos::Device") +ENDIF() + +LIST(LENGTH 
Intrepid2_TEST_ETI_DEVICE_NAME ETI_DEVICE_COUNT) +MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") + +FOREACH(I RANGE ${ETI_DEVICE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) + LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) + #MESSAGE(STATUS "Generating TEST HVOL_TRI_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") + FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) + LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) + LIST(GET Intrepid2_TEST_ETI_SACADO ${J} ETI_SACADO) + FOREACH(ETI_FILE IN LISTS Intrepid2_TEST_ETI_FILE) + SET(ETI_NAME "${ETI_FILE}_${ETI_DEVICE_NAME}_${ETI_VALUETYPE_NAME}") + MESSAGE(STATUS "Generating TEST: HVOL_TRI_Cn_FEM ${ETI_NAME}.cpp") + CONFIGURE_FILE(eti/${ETI_FILE}_ETI.in ${ETI_NAME}.cpp) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ETI_NAME} + SOURCES ${ETI_NAME}.cpp + ARGS PrintItAll + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION "TEST PASSED" + ADD_DIR_TO_NAME + ) + + ENDFOREACH() + ENDFOREACH() +ENDFOREACH() + + + + # test -SET(Intrepid2_TEST_ETI_FILE "test_01") +SET(Intrepid2_TEST_ETI_FILE "test_02") +# value types +SET(Intrepid2_TEST_ETI_VALUETYPE_NAME "") +SET(Intrepid2_TEST_ETI_VALUETYPE "") +SET(Intrepid2_TEST_ETI_SACADO "") + +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DOUBLE_DOUBLE") +LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "double,double") +LIST(APPEND Intrepid2_TEST_ETI_SACADO "0") + +IF (HAVE_INTREPID2_SACADO) + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE_NAME "DFAD_DFAD") + LIST(APPEND Intrepid2_TEST_ETI_VALUETYPE "Sacado::Fad::DFad,Sacado::Fad::DFad ") + LIST(APPEND Intrepid2_TEST_ETI_SACADO "33") +ENDIF() + +LIST(LENGTH Intrepid2_TEST_ETI_VALUETYPE_NAME ETI_VALUETYPE_COUNT) +MATH(EXPR ETI_VALUETYPE_COUNT "${ETI_VALUETYPE_COUNT}-1") + +# device SET(Intrepid2_TEST_ETI_DEVICE_NAME "") SET(Intrepid2_TEST_ETI_DEVICE "") IF(Kokkos_ENABLE_SERIAL) @@ -60,7 +136,6 @@ MATH(EXPR ETI_DEVICE_COUNT "${ETI_DEVICE_COUNT}-1") FOREACH(I 
RANGE ${ETI_DEVICE_COUNT}) LIST(GET Intrepid2_TEST_ETI_DEVICE_NAME ${I} ETI_DEVICE_NAME) LIST(GET Intrepid2_TEST_ETI_DEVICE ${I} ETI_DEVICE) - #MESSAGE(STATUS "Generating TEST HVOL_TRI_Cn_FEM for ${ETI_DEVICE_NAME} with ${ETI_DEVICE}") FOREACH(J RANGE ${ETI_VALUETYPE_COUNT}) LIST(GET Intrepid2_TEST_ETI_VALUETYPE_NAME ${J} ETI_VALUETYPE_NAME) LIST(GET Intrepid2_TEST_ETI_VALUETYPE ${J} ETI_VALUETYPE) diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/eti/test_02_ETI.in b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/eti/test_02_ETI.in new file mode 100644 index 000000000000..4b98bc03263a --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/eti/test_02_ETI.in @@ -0,0 +1,52 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.cpp + \brief Unit test of Intrepid2::Basis_HVOL_TRI_Cn_FEM team-level getValues. + \author Kyungjoo Kim +*/ + +#include "Kokkos_Core.hpp" + +#define ETI_SACADO @ETI_SACADO@ +#if (ETI_SACADO != 0) /// SACADO +#include "Kokkos_ViewFactory.hpp" +#include "Sacado.hpp" +#endif + +#if (ETI_SACADO == 0) /// double double +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#elif (ETI_SACADO == 11 /* SFAD SFAD */ || ETI_SACADO == 33 /* DFAD DFAD */) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 23) +constexpr int num_deriv = 3; +#define ConstructWithLabelOutView(obj, ...) 
obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#elif (ETI_SACADO == 20) +constexpr int num_deriv = 2; +#define ConstructWithLabelOutView(obj, ...) obj(#obj, __VA_ARGS__, num_deriv+1) +#define ConstructWithLabelPointView(obj, ...) obj(#obj, __VA_ARGS__) +#endif + +#include "test_02.hpp" + +int main(int argc, char *argv[]) { + + const bool verbose = (argc-1) > 0; + Kokkos::initialize(); + + Intrepid2::Test::HVOL_TRI_Cn_FEM_Test02<@ETI_VALUETYPE@,@ETI_DEVICE@>(verbose); + + Kokkos::finalize(); + return 0; +} + diff --git a/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/test_02.hpp b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/test_02.hpp new file mode 100644 index 000000000000..aaefaa951c33 --- /dev/null +++ b/packages/intrepid2/unit-test/Discretization/Basis/HVOL_TRI_Cn_FEM/test_02.hpp @@ -0,0 +1,145 @@ +// @HEADER +// ***************************************************************************** +// Intrepid2 Package +// +// Copyright 2007 NTESS and the Intrepid2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/** \file test_02.hpp + \brief Unit tests for the Intrepid2::HVOL_TRI_Cn_FEM class. + \author Created by Kyungjoo Kim, Mauro Perego + */ + + +#include "Intrepid2_config.h" +#include "Kokkos_Random.hpp" +#ifdef HAVE_INTREPID2_DEBUG +#define INTREPID2_TEST_FOR_DEBUG_ABORT_OVERRIDE_TO_CONTINUE +#endif + +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Intrepid2_HVOL_TRI_Cn_FEM.hpp" +#include "packages/intrepid2/unit-test/Discretization/Basis/Setup.hpp" + +namespace Intrepid2 { + + namespace Test { + + // This test evaluates the basis functions at a set of points on a batch of cells using the team-level getValues, + // and compares the results with those obtained using the classic getValues function. 
+ template + int HVOL_TRI_Cn_FEM_Test02(const bool verbose) { + + //! Setup test output stream. + Teuchos::RCP outStream = setup_output_stream( + verbose, "HVOL_TRI_Cn_FEM, Test 2", {} + ); + + *outStream + << "\n" + << "===============================================================================\n" + << "| Testing Team-level Implemntation of getValues |\n" + << "===============================================================================\n"; + + using DeviceSpaceType = typename DeviceType::execution_space; + Kokkos::print_configuration(std::cout, false); + + int errorFlag = 0; + + constexpr int maxOrder = 9; + try { + for (int order=1;order<=maxOrder;++order) { + using BasisType = Basis_HVOL_TRI_Cn_FEM; + auto basisPtr = Teuchos::rcp(new BasisType(order)); + + const int ncells = 5, npts = 10, ndim = 2; + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesA, ncells, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelOutView(outputValuesB, basisPtr->getCardinality(), npts); + Kokkos::DynRankView ConstructWithLabelPointView(point, 1); + + using ScalarType = typename ScalarTraits::scalar_type; + + Kokkos::View inputPointsViewToUseRandom("inputPoints", npts*ndim*get_dimension_scalar(point)); + auto vcprop = Kokkos::common_view_alloc_prop(point); + Kokkos::DynRankView inputPoints (Kokkos::view_wrap(inputPointsViewToUseRandom.data(), vcprop), npts, ndim); + + // random values between (0,1) + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(inputPointsViewToUseRandom, random, 1.0); + + + *outStream << "Order: " << order << ": Computing values for " << ncells << " cells and " << npts << " points using team-level getValues function" <(*basisPtr); + auto basisRawPtr_device = basisPtr_device.get(); + + int scratch_space_level =1; + const int vectorSize = getVectorSizeForHierarchicalParallelism(); + Kokkos::TeamPolicy teamPolicy(ncells, Kokkos::AUTO,vectorSize); + + { //compute values + auto functor = KOKKOS_LAMBDA 
(typename Kokkos::TeamPolicy::member_type team_member) { + auto valsACell = Kokkos::subview(outputValuesA, team_member.league_rank(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + basisRawPtr_device->getValues(valsACell, inputPoints, OPERATOR_VALUE, team_member, team_member.team_scratch(scratch_space_level)); + }; + + //Get the required size of the scratch space per team and per thread. + int perThreadSpaceSize(0), perTeamSpaceSize(0); + basisPtr->getScratchSpaceSize(perTeamSpaceSize,perThreadSpaceSize,inputPoints, OPERATOR_VALUE); + teamPolicy.set_scratch_size(scratch_space_level, Kokkos::PerTeam(perTeamSpaceSize), Kokkos::PerThread(perThreadSpaceSize)); + + Kokkos::parallel_for (teamPolicy,functor); + } + } + + *outStream << "Order: " << order << ": Computing values for " << npts << " points using high-level getValues function" <getValues(outputValuesB, inputPoints, OPERATOR_VALUE); + + *outStream << "Order: " << order << ": Comparing values on host" <(); + for (size_t ic=0;ic tol) { + ++errorFlag; + std::cout << " order: " << order + << ", ic: " << ic << ", i: " << i << ", j: " << j + << ", val A: " << outputValuesA_Host(ic,i,j) + << ", val B: " << outputValuesB_Host(i,j) + << ", |diff|: " << diff + << ", tol: " << tol + << std::endl; + } + } + } + } + } catch (std::exception &err) { + std::cout << "UNEXPECTED ERROR !!! 
----------------------------------------------------------\n"; + std::cout << err.what() << '\n'; + std::cout << "-------------------------------------------------------------------------------" << "\n\n"; + errorFlag = -1000; + }; + + if (errorFlag != 0) + std::cout << "End Result: TEST FAILED\n"; + else + std::cout << "End Result: TEST PASSED\n"; + + return errorFlag; + } + } +} diff --git a/packages/intrepid2/unit-test/Shared/Polylib/test_01.hpp b/packages/intrepid2/unit-test/Shared/Polylib/test_01.hpp index 5f71f3b5e376..4dcc02da30ce 100644 --- a/packages/intrepid2/unit-test/Shared/Polylib/test_01.hpp +++ b/packages/intrepid2/unit-test/Shared/Polylib/test_01.hpp @@ -233,7 +233,8 @@ namespace Intrepid2 { outStream->precision(5); - const ordinal_type npLower = 5, npUpper = Polylib::MaxPolylibPoint; // npUpper: 31 right now + const ordinal_type npLower = 5, npUpper = Polylib::MaxPolylibPoint; + const ordinal_type npUpperStep1 = 21; // we cover all np values from npLower to npUpperStep1; we only cover every 5th one after that const ValueType tol = 1000.0 * tolerence(); const double lowOrderTol = tol; const double highOrderTol = tol * 100; @@ -268,7 +269,8 @@ namespace Intrepid2 { while (alpha <= 5.0) { ValueType beta = -0.5; while (beta <= 5.0) { - for (auto np = npLower; np <= npUpper; ++np){ + ordinal_type npStep = 1; + for (auto np = npLower; np <= npUpper; np += npStep){ const double localTol = (np > 20) ? 
highOrderTol : lowOrderTol; Polylib::Serial::getCubature(z, w, np, alpha, beta, poly); @@ -281,6 +283,7 @@ namespace Intrepid2 { ", np = " << np << ", n = " << n << " integral was " << sum << "\n"; } } + if (np == npUpperStep1) npStep = 5; } beta += 0.5; } @@ -296,7 +299,8 @@ namespace Intrepid2 { while (alpha <= 5.0) { ValueType beta = -0.5; while (beta <= 5.0) { - for (auto np = npLower; np <= npUpper; ++np) { + ordinal_type npStep = 1; + for (auto np = npLower; np <= npUpper; np += npStep) { Polylib::Serial::getCubature(z, w, np, alpha, beta, poly); const double localTol = (np > 20) ? highOrderTol : lowOrderTol; @@ -316,6 +320,7 @@ namespace Intrepid2 { ", np = " << np << ", n = " << n << " difference " << sum << "\n"; } } + if (np == npUpperStep1) npStep = 5; } beta += 0.5; } @@ -331,8 +336,8 @@ namespace Intrepid2 { while (alpha <= 5.0) { ValueType beta = -0.5; while (beta <= 5.0) { - - for (auto np = npLower; np <= npUpper; ++np) { + ordinal_type npStep = 1; + for (auto np = npLower; np <= npUpper; np += npStep) { const double localTol = (np > 20) ? highOrderTol : lowOrderTol; Polylib::Serial::getCubature(z, w, np, alpha, beta, poly); @@ -353,6 +358,7 @@ namespace Intrepid2 { ", np = " << np << ", n = " << n << " difference " << sum << "\n"; } } + if (np == npUpperStep1) npStep = 5; } beta += 0.5; } From d88fa994bdb626eb4dac0026e6be601f9a62f03c Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Fri, 23 Aug 2024 19:21:52 -0600 Subject: [PATCH 089/243] MueLu: Fixing Issue #13377 and #13378 Issues listed above have been addressed. Threshold has been redefined to 1/threshold. Unit tests have been modified to be more thorough. 
Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 60 +++--- .../test/unit_tests/CoalesceDropFactory.cpp | 178 +++++++++++++++--- 2 files changed, 187 insertions(+), 51 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 0431bf011541..1f9961289cb0 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -591,13 +591,27 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); + RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); + RCP boundaryColumnVector; + for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; + } + if(!importer.is_null()) { + boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); + boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); + } + else { + boundaryColumnVector = boundaryNodesVector; + } + auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); 
Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); @@ -608,30 +622,24 @@ void CoalesceDropFactory::Build(Level auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { index_view(colID) = colID; LO col = rowView.colidx(colID); //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { + if(row == col || boundary(col)) { drop_view(colID) = true; } else { drop_view(colID) = false; - count++; } - }, dropSize); + }); - size_t dropStart = dropSize; + size_t dropStart = nnz; if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { @@ -646,7 +654,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -658,7 +666,7 @@ void CoalesceDropFactory::Build(Level y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); } - if(x_aij > realThresholdKokkos * y_aij) { + if(realThresholdKokkos * 
realThresholdKokkos * x_aij > y_aij) { if(i < min) { min = i; } @@ -673,30 +681,30 @@ void CoalesceDropFactory::Build(Level else { auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); } }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; typename implATS::magnitudeType y_val = 0; if(!drop_view(x)) { typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); x_val = x_aij / x_aiiajj; } if(!drop_view(y)) { typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType y_aiiajj = 
implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); y_val = y_aij / y_aiiajj; } - if(x_val > realThresholdKokkos * y_val) { + if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { if(i < min) { min = i; } @@ -705,15 +713,15 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < dropSize) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) { + if(dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { @@ -1381,7 +1389,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val; auto b = y.val; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { @@ -1404,7 +1412,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val / x.diag; auto b = y.val / y.diag; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index e8902b178708..0073ca7e9bfb 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1223,7 +1223,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianScaledCu // L_ij = -36 // L_ii = 72 
// criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1289,7 +1289,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianUnscaled // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("unscaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1355,7 +1355,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianCutSym, // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut symmetric"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1389,6 +1389,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala typedef 
Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1399,11 +1401,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList 
galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1429,25 +1461,59 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), 
rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1458,11 +1524,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow 
== 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1488,19 +1584,51 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int 
columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalUnScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, SignaledClassical, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include @@ -1902,7 +2030,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonal, Scalar, Lo coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); @@ -1949,7 +2077,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonalClassical, S coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + 
coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal classical"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); fineLevel.Request("Graph", &coalesceDropFact); From 4db43f835309ec703440a13668a65025cb3415f0 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Sat, 2 Nov 2024 23:14:50 -0600 Subject: [PATCH 090/243] ShyLU - Basker : tune memory pre-allocation Signed-off-by: iyamazaki --- .../basker/src/shylubasker_decl.hpp | 19 -- .../basker/src/shylubasker_error_manager.hpp | 6 +- .../basker/src/shylubasker_nfactor_blk.hpp | 44 +-- .../basker/src/shylubasker_nfactor_col2.hpp | 26 +- .../basker/src/shylubasker_order.hpp | 8 +- .../basker/src/shylubasker_order_scotch.hpp | 40 ++- .../basker/src/shylubasker_sfactor.hpp | 275 ++++++++++-------- .../basker/src/shylubasker_structs.hpp | 3 +- .../basker/src/shylubasker_thread.hpp | 9 +- .../basker/src/shylubasker_types.hpp | 2 +- .../basker/src/shylubasker_util.hpp | 6 +- 11 files changed, 247 insertions(+), 191 deletions(-) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp index 09e3f6f98382..7fd24c7fb1d7 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_decl.hpp @@ -528,25 +528,6 @@ namespace BaskerNS Int off_diag ); - BASKER_INLINE - void L_blk_sfactor - ( - BASKER_MATRIX &MV, - BASKER_SYMBOLIC_TREE &ST, - INT_1DARRAY gcol, - INT_1DARRAY grow - ); - - //old - BASKER_INLINE - void L_blk_sfactor - ( - BASKER_MATRIX_VIEW &MV, - BASKER_SYMBOLIC_TREE &ST, - INT_1DARRAY gcol, - INT_1DARRAY grow - ); - BASKER_INLINE void S_sfactor_reduce ( diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp 
b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp index cd2c9f57bf0a..b56d378de8a7 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_error_manager.hpp @@ -77,7 +77,7 @@ namespace BaskerNS << " DOMBLK MALLOC : blk=" << thread_array(ti).error_blk << " subblk=" << thread_array(ti).error_subblk << " newsize=" << thread_array(ti).error_info - << std::endl; + << std::endl << std::flush; } //If on diagonal, want to compare L and U @@ -113,7 +113,7 @@ namespace BaskerNS { if(Options.verbose == BASKER_TRUE) { - std::cout << " ++ resize L( tid = " << ti << " ): new size = " << resize_L << std::endl; + std::cout << " ++ resize L( tid = " << ti << " ): new size = " << resize_L << std::endl << std::flush; } BASKER_MATRIX &L = LL(thread_array(ti).error_blk)(thread_array(ti).error_subblk); @@ -139,7 +139,7 @@ namespace BaskerNS { if(Options.verbose == BASKER_TRUE) { - std::cout << " ++ resize U( tid = " << ti << " ): new size = " << resize_U << std::endl; + std::cout << " ++ resize U( tid = " << ti << " ): new size = " << resize_U << std::endl << std::flush; } BASKER_MATRIX &U = LU(thread_array(ti).error_blk)(0); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp index 2e0434796e33..614663f193a9 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_blk.hpp @@ -546,37 +546,37 @@ namespace BaskerNS if (Options.replace_tiny_pivot && normA_blk > abs(zero)) { // just insert tiny pivot on diagonal maxindex = k; - while (gperm(maxindex+brow_g) != BASKER_MAX_IDX && maxindex < M.ncol) { + while (gperm(maxindex+brow_g) != BASKER_MAX_IDX && maxindex < M.ncol) { maxindex ++; - } - if (maxindex < M.ncol) { + } + if (maxindex < M.ncol) { if (Options.verbose == BASKER_TRUE) { cout << " thread-" << kid << " Explicit 
tiny pivot for maxind = " << maxindex << endl; - } + } pivot = normA_blk * sqrt(eps); lastU = pivot; npivots ++; - explicit_pivot = true; - } + explicit_pivot = true; + } } else if (Options.replace_zero_pivot && normA_blk > abs(zero)) { // just insert tiny pivot on diagonal maxindex = k; - while (gperm(maxindex+brow_g) != BASKER_MAX_IDX && maxindex < M.ncol-1) { + while (gperm(maxindex+brow_g) != BASKER_MAX_IDX && maxindex < M.ncol-1) { maxindex ++; - } - if (maxindex < M.ncol) { + } + if (maxindex < M.ncol) { if (Options.verbose == BASKER_TRUE) { cout << " thread-" << kid << " Explicit nonzero pivot for maxind = " << maxindex << "(" << gperm(maxindex+brow_g) << ")" << endl; - } + } pivot = normA_blk * eps; lastU = pivot; npivots ++; - explicit_pivot = true; - } + explicit_pivot = true; + } } - if (!explicit_pivot) { + if (!explicit_pivot) { thread_array(kid).error_type = BASKER_ERROR_SINGULAR; thread_array(kid).error_blk = b; @@ -1543,8 +1543,8 @@ namespace BaskerNS #ifdef BASKER_DEBUG_NFACTOR_BLK - printf("t_dense_move_offdiag_L, kid=%d, k=%d: L (%d %d) X (%d %d)\n", - kid, k, blkcol,blkrow, X_col, X_row); + printf("t_dense_move_offdiag_L, kid=%d, k=%d: L (%d %d) X (%d %d), nnz=%d\n", + kid, k, blkcol,blkrow, X_col, X_row, L.nnz); #endif @@ -1565,7 +1565,6 @@ namespace BaskerNS } */ - ///for(Int i = 0; i < p_size; i++) for(Int j = 0; j < L.nrow; ++j) { //Int j = pattern[i]; @@ -1573,7 +1572,15 @@ namespace BaskerNS if(X(j) != (Entry)(0) ) { //Int t = gperm(j+brow); - + if (lnnz >= L.nnz) { // this should not happen since allocated as dense separator blocks + if (Options.verbose == BASKER_TRUE) + { + printf("Move Off-diag L failed with insufficient storage L(%d,%d).nnz = %d\n", + (int)blkcol, (int)blkrow, (int)L.nnz ); + } + BASKER_ASSERT(true, "\n Not enough memory allocated for off-diagonal L\n"); + return BASKER_ERROR; + } #ifdef BASKER_DEBUG_NFACTOR_BLK printf("L-Moving, kid: %d j: %d val: %f lnnz: %d \n", kid, j, X[j]/pivot, lnnz); @@ -1594,7 +1601,6 @@ 
namespace BaskerNS #ifdef BASKER_INC_LVL L.inc_lvl[lnnz] = INC_LVL_TEMP[j]; #endif - lnnz++; } } @@ -1756,7 +1762,7 @@ namespace BaskerNS printf("t_back_solve_diag, kid: %d, ws: %d starting psize: %d \n", kid, ws_size, nnz); printf("t_back_solve_diag, kid: %d, ALM(%d)(%d): %dx%d\n",kid,blkcol,blkrow,B.nrow,B.ncol ); - printf("t_back_solve_diag, kid: %d, LL(%d)(%d): %dx%d\n",kid,blkcol,blkrow,L.nrow,L.ncol ); + printf("t_back_solve_diag, kid: %d, LL(%d)(%d): %dx%d, nnz=%d, X.nnz=%d\n",kid,blkcol,blkrow,L.nrow,L.ncol,LL(blkcol)(blkrow).nnz,X.extent(0) ); printf("\n\n"); fflush(stdout); #endif //B.info(); diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp index 5e9345ed02ec..7b65e1d94ed0 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_nfactor_col2.hpp @@ -233,7 +233,7 @@ namespace BaskerNS }//for - over all sublevel 1...lvl-2 #ifdef BASKER_TIMER printf("Time Upper-Col(%d): %lf \n", (int)kid, timer.seconds()); - timer.reset(); + fflush(stdout); timer.reset(); #endif //---------Lower Factor (old sublevel lvl-1)------- @@ -255,11 +255,11 @@ namespace BaskerNS } } #endif - #ifdef BASKER_DEBUG_NFACTOR_COL2 + #ifdef BASKER_TIMER printf("\n done with UPPER, kid: %d \n\n", kid); + printf("\n\n======= LOWER, KID: %d ======= \n\n", kid); + fflush(stdout); #endif - - //printf("\n\n======= LOWER, KID: %d ======= \n\n", kid); //return; // > accumulate the last update // > factor the diagonal block LU(U_col)(U_row) @@ -284,7 +284,8 @@ namespace BaskerNS if (info == BASKER_SUCCESS) { #ifdef BASKER_DEBUG_NFACTOR_COL2 - printf( " kid=%d: calling t_add_extend(k=%d/%d)\n",kid,k,ncol ); fflush(stdout); + printf( " kid=%d: calling t_add_extend(k=%d/%d) with LU(%d,%d).nnz = %d\n", + kid,k,ncol,U_col,U_row,LU(U_col)(U_row).nnz ); fflush(stdout); #endif t_add_extend(thread, kid,lvl,lvl-1, k, 
LU(U_col)(U_row).scol, @@ -316,13 +317,13 @@ namespace BaskerNS } } #ifdef BASKER_DEBUG_NFACTOR_COL2 - printf(" > done calling lower factor, kid: %d k: %d info=%d\n", kid, k, info); fflush(stdout); - #endif - #ifdef BASKER_DEBUG_NFACTOR_COL2 else { printf(" + skipping lower factor, kid: %d k: %d \n", kid, k); fflush(stdout); } #endif + #ifdef BASKER_DEBUG_NFACTOR_COL2 + printf(" > done calling lower factor, kid: %d k: %d info=%d\n", kid, k, info); fflush(stdout); + #endif //need barrier if multiple thread uppdate #ifdef USE_TEAM_BARRIER_NFACTOR_COL2 thread.team_barrier(); @@ -356,12 +357,12 @@ namespace BaskerNS timer_facoff.reset(); #endif #ifdef BASKER_DEBUG_NFACTOR_COL2 - printf(" calling lower diag factor, kid: %d k: %d \n", + printf(" calling lower offdiag factor, kid: %d k: %d \n", kid, k); fflush(stdout); #endif t_lower_col_factor_offdiag2(kid, lvl, lvl-1, k, pivot); #ifdef BASKER_DEBUG_NFACTOR_COL2 - printf(" done lower diag factor, kid: %d k: %d \n", + printf(" done lower offdiag factor, kid: %d k: %d \n", kid, k); fflush(stdout); #endif } @@ -906,7 +907,10 @@ namespace BaskerNS L_row < LL_size(L_col); X_row+=(lteam_size), L_row+=(lteam_size)) { - //printf("OFF_DIAG_LOWER. kid: %d k: %d U: %d %d L: %d %d X: %d %d pivot: %f \n", kid, k, U_col, U_row, L_col, L_row, X_col, X_row, pivot); + #ifdef BASKER_TIMER + printf("OFF_DIAG_LOWER. 
kid: %d k: %d U(%d, %d).nnz = %d L(%d, %d) X(%d, %d) pivot: %f \n", + kid, k, U_col, U_row, LU(U_col)(U_row).nnz, L_col, L_row, X_col, X_row, pivot); + #endif /*old t_back_solve_offdiag(leader_id, L_col, L_row, diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp index 82ea04be3754..e9cddba1b3bb 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_order.hpp @@ -1160,8 +1160,10 @@ static int basker_sort_matrix_col(const void *arg1, const void *arg2) std::cout << " > scotch_partition returned with info = " << info_scotch << " and apply_nd = " << apply_nd << std::endl; } return info_scotch; + } else if(Options.verbose == BASKER_TRUE) { + printf( "\n part_scotch done (num_threads = %d,%d)\n",num_threads,part_tree.leaf_nnz.extent(0) ); + //for (Int i = 0; i < num_threads; i++) printf( " nnz_leaf[%d] = %d\n",i,part_tree.leaf_nnz[i] ); printf( "\n" ); } - nd_flag = BASKER_TRUE; //permute permute_row(M, part_tree.permtab); @@ -2200,7 +2202,9 @@ static int basker_sort_matrix_col(const void *arg1, const void *arg2) INT_1DARRAY row ) { - permute_row(M.nnz, &(M.row_idx(0)), &(row(0))); + if (M.nnz > 0) { + permute_row(M.nnz, &(M.row_idx(0)), &(row(0))); + } return 0; }//end permute_row(matrix,int) diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_order_scotch.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_order_scotch.hpp index c70fe3507862..e30606385847 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_order_scotch.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_order_scotch.hpp @@ -235,7 +235,9 @@ namespace BaskerNS // id of the first leaf node (BF order, post_order maps from BF to ND) Int leaves_id = pow(2.0, (double)(num_levels)) - 1; - //printf( " num_levels = %d, num_doms = %d, leves_id = %d\n",num_levels,num_doms,leaves_id ); + if (Options.verbose == BASKER_TRUE) { + printf( " 
num_domains = %d: num_levels = %d, num_doms = %d, leves_id = %d\n",num_domains,num_levels,num_doms,leaves_id ); + } // > insert root Int num_queued = 0; @@ -297,11 +299,14 @@ namespace BaskerNS // level goes to num_leaves so that we can call ND on the final leaf nodes last_level = num_levels; } - if (Options.verbose == BASKER_TRUE) { - if (run_nd_on_leaves) { + if (run_nd_on_leaves) { + if (Options.verbose == BASKER_TRUE) { std::cout << std::endl << " + Using ND on leaves + " << std::endl; - } else if (run_amd_on_leaves) { - std::cout << std::endl << " + Using AMD on leaves + " << std::endl; + } + } else if (run_amd_on_leaves) { + MALLOC_INT_1DARRAY(BT.leaf_nnz, num_doms); + if (Options.verbose == BASKER_TRUE) { + std::cout << std::endl << " + Using AMD on leaves (# doms = " << num_doms << ") + " << std::endl; } } // -------------------------------------------------- // @@ -551,11 +556,16 @@ namespace BaskerNS for(Int i = 0; i < metis_size_k; i++) { metis_iperm_k(metis_perm_k(i)) = i; } + if (Options.verbose == BASKER_TRUE) { + std::cout << std::endl << " > Basker AMD on leaf : estimated nnz(L(" << leaf_id << ") = " << l_nnz + << " <" << std::endl << std::endl; + } info = METIS_OK; } else { std::cout << std::endl << " > Basker AMD failed < " << std::endl << std::endl; return BASKER_ERROR; // TODO: what to do here? 
} + BT.leaf_nnz(leaf_id) = l_nnz; } // update perm/ @@ -888,7 +898,7 @@ namespace BaskerNS sg.nz = sg.Ap[sg.m]; //printf("num self_edge: %d sg.m: %d \n", - // self_edge, sg.m); + // self_edge, sg.m); if(self_edge != (sg.m)) { BASKER_ASSERT(self_edge == (sg.m-1), @@ -990,11 +1000,11 @@ namespace BaskerNS #ifdef BASKER_DEBUG_ORDER_SCOTCH printf("FIX SCOTCH PRINT OUT\n"); printf("SCOTCH: NUM_LEVELS ASKED = %d, NUM DOMS GOT = %d, NUM TREES = %d \n", - num_levels, sg.cblk, num_trees); + num_levels, sg.cblk, num_trees); printf("\n"); printf("%d %d should blks: %f \n", - 2, ((Int)num_levels+1), - pow(2.0,((double)num_levels+1))-1); + 2, ((Int)num_levels+1), + pow(2.0,((double)num_levels+1))-1); #endif if(((sg.cblk) != pow(2.0,((double)num_levels+1))-1) || (num_trees != 1)) @@ -1028,7 +1038,7 @@ namespace BaskerNS #ifdef BASKER_DEBUG_ORDER_SCOTCH printf("\n\n Starting DEBUG COMPLETE OUT \n\n"); printf("Tree: "); - ` for(Int i = 0; i < iblks+1; i++) + for(Int i = 0; i < iblks+1; i++) { printf("%d, ", ttree(i)); } @@ -1217,11 +1227,11 @@ namespace BaskerNS Int mynum = iblks-1; otree(iblks) = -1; rec_build_tree(lvl, - lpos,rpos, - mynum, - otree); + lpos,rpos, + mynum, + otree); - + INT_1DARRAY ws; BASKER_ASSERT((iblks+1)>0, "scotch iblks 2"); MALLOC_INT_1DARRAY(ws, iblks+1); @@ -1486,7 +1496,7 @@ namespace BaskerNS ) { //printf("assign, lpos: %d rpos: %d number: %d\n", - // lpos, rpos, mynum); + // lpos, rpos, mynum); if(lvl > 0) { diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp index c955ff952551..7a91f6c8a577 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_sfactor.hpp @@ -39,7 +39,7 @@ using namespace std; #endif #include "Teuchos_OrdinalTraits.hpp" -//#define BASKER_TIMER +//#define BASKER_TIMER //#define BASKER_DEBUG_SFACTOR //Functor for Kokkos @@ -76,7 +76,7 @@ namespace BaskerNS { #ifdef BASKER_KOKKOS 
//Int kid = (Int)(thread.league_rank()*thread.team_size()+ - // thread.team_rank()); + // thread.team_rank()); Int kid = basker->t_get_kid(thread); #endif @@ -114,7 +114,7 @@ namespace BaskerNS { #ifdef BASKER_KOKKOS //Int kid = (Int)(thread.league_rank()*thread.team_size()+ - // thread.team_rank()); + // thread.team_rank()); Int kid = basker->t_get_kid(thread); #endif printf( " * kokkos_sfactor_init_factor(%d) *\n",kid ); fflush(stdout); @@ -172,8 +172,8 @@ int Basker::sfactor() //Allocate Factorspace #ifdef BASKER_TIMER - printf(" >> kokkos_sfactor_init_factor( btf_tabs_offset = %d, allocate_nd_workspace = %d ) <<\n", - btf_tabs_offset,allocate_nd_workspace); fflush(stdout); + printf(" >> kokkos_sfactor_init_factor( btf_tabs_offset = %d, allocate_nd_workspace = %d, num_threads= %d ) <<\n", + btf_tabs_offset,allocate_nd_workspace,num_threads); fflush(stdout); #endif if(btf_tabs_offset != 0 && allocate_nd_workspace) { @@ -316,8 +316,6 @@ int Basker::sfactor() timer2.reset(); #endif - //split_num = num_threads/2; - //for(Int p =0; p < 1; ++p) if(Options.verbose == BASKER_TRUE) { printf("\n"); @@ -378,21 +376,35 @@ int Basker::sfactor() //Assign nnz here //leaf_assign_nnz(LL(blk)(0), stree, 0); //leaf_assign_nnz(LU(blk)(LU_size[blk]-1), stree, 0); - if(Options.verbose == BASKER_TRUE) - { - printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,0); - printf( " >> leaf_assign_nnz(LU(%d)(%d))\n",(int)blk,(int)LU_size(blk)-1); - } #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) timer1.reset(); #endif - #ifdef SHYLU_BASKER_STREE_LIST - leaf_assign_nnz(LL(blk)(0), stree_p, 0); - leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree_p, 0); - #else - leaf_assign_nnz(LL(blk)(0), stree, 0); - leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree, 0); - #endif + if (!Options.run_nd_on_leaves && Options.run_amd_on_leaves) { + double fill_factor = 1.0 + BASKER_DOM_NNZ_OVER+Options.user_fill; + if(Options.verbose == BASKER_TRUE) + { + printf( " >> leaf_assign_nnz(LL(%d)(%d)) = (1.0 
+ %.1f + %.1f) + leaf_nnz[%d] = %d from AMD\n",(int)blk,0, + BASKER_DOM_NNZ_OVER,Options.user_fill,p,part_tree.leaf_nnz[p] ); + printf( " >> leaf_assign_nnz(LU(%d)(%d)) = (1.0 + %.1f + %.1f) + leaf_nnz[%d] = %d from AMD\n",(int)blk,(int)LU_size(blk)-1, + BASKER_DOM_NNZ_OVER,Options.user_fill,p,part_tree.leaf_nnz[p] ); + } + LL(blk)(0).nnz = part_tree.leaf_nnz[p] * fill_factor; + LU(blk)(LU_size(blk)-1).nnz = part_tree.leaf_nnz[p] * fill_factor; + global_nnz += (LL(blk)(0).nnz + LU(blk)(LU_size(blk)-1).nnz); + } else { + if(Options.verbose == BASKER_TRUE) + { + printf( " >> leaf_assign_nnz(LL(%d)(%d))\n",(int)blk,0); + printf( " >> leaf_assign_nnz(LU(%d)(%d))\n",(int)blk,(int)LU_size(blk)-1); + } + #ifdef SHYLU_BASKER_STREE_LIST + leaf_assign_nnz(LL(blk)(0), stree_p, 0); + leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree_p, 0); + #else + leaf_assign_nnz(LL(blk)(0), stree, 0); + leaf_assign_nnz(LU(blk)(LU_size(blk)-1), stree, 0); + #endif + } #if defined(BASKER_TIMER) & !defined(SHYLU_BASKER_STREE_LIST) time2 += timer1.seconds(); #endif @@ -406,8 +418,18 @@ int Basker::sfactor() std::cout << " DOMAIN BLKs done : " << dom_time << std::endl << std::endl; #endif + if(Options.verbose == BASKER_TRUE) + { + printf("\n"); + printf("\n --------------- OVER OFF-DIAGS ---------------\n"); + printf("\n"); + } for(Int p = 0; p < num_threads; ++p) { + if(Options.verbose == BASKER_TRUE) + { + printf(" ============= OFF-DIAG BLK (p=%d) ============\n",(int)p); + } //Do off diag Int blk = S(0)(p); #ifdef SHYLU_BASKER_STREE_LIST @@ -436,7 +458,7 @@ int Basker::sfactor() Int off_diag = 1; //printf( " U_blk_sfactor(AVM(%d,%d))\n",U_col,U_row ); //U_blk_sfactor(AV[U_col][U_row], stree, - // gScol(l), gSrow(glvl),0); + // gScol(l), gSrow(glvl),0); #ifdef BASKER_TIMER timer1.reset(); #endif @@ -457,7 +479,7 @@ int Basker::sfactor() //Reduce all into global (don't need in serial) //S_sfactor_reduce(AV[U_col][U_row], - // stree, gScol, gSrow); + // stree, gScol, gSrow); //Assign nnz counts 
for leaf off-diag //U_assign_nnz(LU(U_col)(U_row), stree, 0); @@ -493,7 +515,6 @@ int Basker::sfactor() timer.reset(); #endif - //do all the sep if(Options.verbose == BASKER_TRUE) { @@ -503,6 +524,10 @@ int Basker::sfactor() } for(Int lvl=0; lvl < tree.nlvls; lvl++) { + if(Options.verbose == BASKER_TRUE) + { + printf(" ============= SEPARATOR BLK (level=%d) ============\n",(int)lvl); + } //Number of seps in the level Int p = pow(tree.nparts, tree.nlvls-lvl-1); @@ -554,7 +579,6 @@ int Basker::sfactor() printf( " >>> -> nnz = %d\n",ALM(U_col)(U_row).nnz ); fflush(stdout); #endif - //S_assign_nnz(LL(U_col)(U_row), stree, 0); if(Options.verbose == BASKER_TRUE) { printf( " >> S_assign_nnz( LL(%d,%d) )\n",(int)U_col,(int)U_row ); fflush(stdout); @@ -612,7 +636,7 @@ int Basker::sfactor() printf("BLK: %d %d Col: %d Row: %d \n", U_col, U_row, l, pp); #endif - Int off_diag = 1; + Int off_diag = -1; // dense #ifdef SHYLU_BASKER_STREE_LIST U_blk_sfactor(AVM(U_col)(U_row), stree_p, gScol(l), gSrow(pp), off_diag); @@ -626,7 +650,7 @@ int Basker::sfactor() //Don't need in serial //S_sfactor_reduce(AV[U_col][U_row], - // stree, gScol, gSrow); + // stree, gScol, gSrow); //Assign nnz @@ -635,7 +659,7 @@ int Basker::sfactor() { printf( " ++ leaf_assign_nnz(LU(%d, %d)) fill-factor x(%d+%f = %f)\n",(int)U_col,(int)U_row, (int)BASKER_SEP_NNZ_OVER,Options.user_fill,fill_factor); printf( " ++ leaf_assign_nnz(LL(%d, %d)) fill-factor x(%d+%f = %f)\n",(int)inner_blk,(int)(l-lvl), (int)BASKER_SEP_NNZ_OVER,Options.user_fill,fill_factor); - fflush(stdout); + fflush(stdout); } #ifdef SHYLU_BASKER_STREE_LIST U_assign_nnz(LU(U_col)(U_row), stree_p, fill_factor, 0); @@ -1470,8 +1494,8 @@ int Basker::sfactor() printf("\n\n"); printf("BLK: %d %d \n", brow, bcol); printf("row: %d %d col: %d %d \n", - MV.srow, MV.nrow+MV.srow, - MV.scol, MV.ncol+MV.scol); + MV.srow, MV.nrow+MV.srow, + MV.scol, MV.ncol+MV.scol); printf("\n\n"); #endif @@ -1720,6 +1744,10 @@ int Basker::sfactor() { ST.L_row_counts(j) = 
0; ST.U_col_counts(j) = 0; + } + // diagonal => square + for(Int j = 0; j < MV.ncol; j++) + { for(Int k = MV.col_ptr(j); k < MV.col_ptr(j+1); ++k) { Int i = MV.row_idx(k); if (i < j) { @@ -1732,26 +1760,59 @@ int Basker::sfactor() } } } + #ifdef BASKER_TIMER + std::cout << " >> U_blk_sfactor(diag): " << timer.seconds() << " seconds" << std::endl; + timer.reset(); + #endif } - #ifdef BASKER_TIMER - std::cout << " >> U_blk_sfactor::loop-columns : " << timer.seconds() << " seconds" << std::endl; - #endif #endif - - //Temp Patch fix - //Note Comebaske if(off_diag == 1) { - for(Int i = 0; i < MV.ncol; i++) + Int tot_nnz = 0; + for(Int j = 0; j < MV.ncol; j++) + { + #if 1 + /*{ // original + Int nnz = MV.col_ptr(j+1) - MV.col_ptr(j); + ST.L_row_counts(j) = nnz; + ST.U_col_counts(j) = nnz; + } */ + // dense after first nz in each column + Int min_i = MV.nrow; + for(Int k = MV.col_ptr(j); k < MV.col_ptr(j+1); ++k) { + Int i = MV.row_idx(k); + min_i = min(i, min_i); + } + ST.L_row_counts(j) = MV.nrow - min_i; + ST.U_col_counts(j) = MV.nrow - min_i; + #else // fully dense + ST.U_col_counts(j) = MV.nrow; + ST.L_row_counts(j) = MV.nrow; + #endif + tot_nnz += ST.L_row_counts(j); + } + #ifdef BASKER_TIMER + std::cout << " >> U_blk_sfactor::off-diag ("<< MV.nrow << " x " << MV.ncol << "): with nnz = " << tot_nnz << " => " + << double(tot_nnz) / double (MV.ncol*MV.nrow) << ", " << double(tot_nnz) / double (MV.nnz) + << ": " << timer.seconds() << " seconds" << std::endl; + timer.reset(); + #endif + } + if(off_diag == -1) + { + Int tot_nnz = 0; + for(Int j = 0; j < MV.ncol; j++) { - ST.U_col_counts(i) = MV.nrow; - ST.L_row_counts(i) = MV.nrow; + ST.U_col_counts(j) = MV.nrow; + ST.L_row_counts(j) = MV.nrow; + tot_nnz += ST.L_row_counts(j); } + #ifdef BASKER_TIMER + std::cout << " >> U_blk_sfactor::dense ("<< MV.nrow << " x " << MV.ncol << "): with nnz = " << tot_nnz << " => " << double(tot_nnz) / double (MV.ncol*MV.nrow) + << ": " << timer.seconds() << " seconds" << std::endl; + 
timer.reset(); + #endif } - #ifdef BASKER_TIMER - std::cout << " >> U_blk_sfactor::copy : " << timer.seconds() << " seconds" << std::endl; - timer.reset(); - #endif FREE(U_col_count); FREE(color); @@ -1994,8 +2055,9 @@ int Basker::sfactor() //Temp Patch fix //Note Comebaske - if(off_diag ==1) + if(off_diag == 1) { + printf( " U_blk_sfactor(off-diag: %d x %d)\n",MV.nrow,MV.ncol ); for(Int i = 0; i < MV.ncol; i++) { ST.U_col_counts[i] = MV.nrow; @@ -2012,43 +2074,6 @@ int Basker::sfactor() FREE(first_row); }//end U_blk_sfactor() - - template - void Basker::L_blk_sfactor - ( - BASKER_MATRIX &MV, - BASKER_SYMBOLIC_TREE &ST, - INT_1DARRAY gcol, - INT_1DARRAY grow - ) - { - printf("Basker: This L_blk_sfactor algorithm is not implemented\n"); - //Algorithm - //You can either use the Row-count method or - //Assume same as U_blk for symmtric case. - //Note, Very unsymmtric and HUND will most likely not - //Need this called as we will use the QR on nxns - }//end L_blk_sfactor() - - - template - void Basker::L_blk_sfactor - ( - BASKER_MATRIX_VIEW &MV, - BASKER_SYMBOLIC_TREE &ST, - INT_1DARRAY gcol, - INT_1DARRAY grow - ) - { - printf("Basker: This L_blk_sfactor algorithm is not implemented\n"); - //Algorithm - //You can either use the Row-count method or - //Assume same as U_blk for symmtric case. - //Note, Very unsymmtric and HUND will most likely not - //Need this called as we will use the QR on nxns - }//end L_blk_sfactor() - - template void Basker::S_sfactor_reduce ( @@ -2095,9 +2120,7 @@ int Basker::sfactor() //Give a = nnz(L(:,1)) and b = nnz(U(1,:)) //If a*b == (size-size)^2 .... 
adjust padding - //Int brow = MV.srow; //Not used - //Int bcol = MV.scol; //Not used - + #if 0 //Find nnz L(:,1) Int nnz_c = 0; for(Int i = MV.srow; i < (MV.srow+MV.nrow); i++) @@ -2108,7 +2131,7 @@ int Basker::sfactor() } } nnz_c += 1; - + #endif #ifdef BASKER_DEBUG_SFACTOR printf("S - nnz(L(:,1)): %d \n", nnz_c); #endif @@ -2141,8 +2164,8 @@ int Basker::sfactor() nnz_S = Teuchos::OrdinalTraits::max()/2; } - #ifdef BASKER_DEBUG_SFACTOR - printf("Snnz: %d \n", nnz_S); + #ifdef BASKER_TIMER + printf(" > Snnz: %d \n", nnz_S); #endif ST.init_S_col_counts(1); @@ -2276,11 +2299,16 @@ int Basker::sfactor() t_nnz += ST.col_counts[i]; } else { // let's just hope it is enough, if overflow + t_nnz = Int_MAX; + #ifdef BASKER_TIMER + printf( " - overflow nnz = %ld (%d/%d)\n",t_nnz,i,M.ncol ); + for (Int ii = 0; ii <= i; ii++) printf( " * col_counts[%d] = %ld\n",ii,ST.col_counts[ii] ); + #endif break; } } #ifdef BASKER_TIMER - printf(" > leaf nnz: (%ld + %ld) / 2 = %ld\n", (long)t_nnz,(long)M.ncol,(long)(t_nnz+M.ncol)/2); + printf(" > leaf nnz: (t_nnz = %ld + ncol = %ld) / 2 = %ld\n", (long)t_nnz,(long)M.ncol,(long)(t_nnz+M.ncol)/2); #endif t_nnz = long(t_nnz+M.ncol)/2; @@ -2321,35 +2349,48 @@ int Basker::sfactor() Int option ) { - if(option == 0) + if(option == 0 || option == 1) { const Int Int_MAX = std::numeric_limits::max(); - Int t_nnz = 0; - for(Int i = 0; i < M.ncol; i++) - { - if (t_nnz <= Int_MAX-ST.U_col_counts[i]) { - t_nnz += ST.U_col_counts[i]; - } else { - // let's just hope it is enough, if overflow - break; - } - } - - #ifdef BASKER_TIMER - printf("U_assing_nnz: %ld \n", t_nnz); - #endif - - //double fill_factor = 1.05; - Int temp = min(M.nrow*M.ncol, Int(fill_factor*t_nnz)); - if (temp >= t_nnz) { - M.nnz = temp; + Int t_nnz = 0; + if (option == 1) { + // dense + t_nnz = (M.nrow*M.ncol); } else { + Int k_nnz = 0; + for(Int i = 0; i < M.ncol; i++) + { + if (k_nnz <= Int_MAX-ST.U_col_counts[i]) { + k_nnz += ST.U_col_counts[i]; + } else { + // let's just hope it 
is enough, if overflow + k_nnz = Int_MAX; + #ifdef BASKER_TIMER + printf( " - overflow U_nnz = %ld (%d/%d)\n",k_nnz,i,M.ncol ); + for (Int ii = 0; ii <= i; ii++) printf( " * U_col_counts[%d] = %ld\n",ii,ST.U_col_counts[ii] ); + #endif + break; + } + } + t_nnz = Int(fill_factor*double(k_nnz)); + if (fill_factor > 1.0 && k_nnz > t_nnz) { + t_nnz = k_nnz; + } + Int mn = max(0,M.nrow*M.ncol); + if (mn > 0 && mn < t_nnz) { + t_nnz = mn; + } M.nnz = t_nnz; + #ifdef BASKER_TIMER + printf("U_assing_nnz: %ld min(%d, %d)-> %ld\n", k_nnz,M.nrow*M.ncol, Int(fill_factor*double(k_nnz)), M.nnz); + #endif } if (global_nnz <= Int_MAX-t_nnz) { // let's just hope it is enough, if overflow global_nnz += t_nnz; + } else { + global_nnz = Int_MAX; } #if 0 printf( " debug: set U.nnz = 0 to force realloc\n" ); @@ -2358,8 +2399,8 @@ int Basker::sfactor() #endif if(Options.verbose == BASKER_TRUE) { - printf("U_assing with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %f), M.nnz = %ld (%ld x %ld)\n", - (long)global_nnz,(long)t_nnz, fill_factor, (long)M.nnz,(long)M.nrow,(long)M.ncol); + printf("U_assing with elbow global_nnz = %ld, t_nnz = %ld (fill_factor = %f), M.nnz = %ld (%ld x %ld) -> %.2f\n", + (long)global_nnz,(long)t_nnz, fill_factor, (long)M.nnz,(long)M.nrow,(long)M.ncol, ((double)M.nnz)/((double)(M.nrow*M.ncol))); } } }//end assign_upper_nnz @@ -2386,6 +2427,11 @@ int Basker::sfactor() t_nnz += ST.L_row_counts[i]; } else { // let's just hope it is enough, if overflow + t_nnz = Int_MAX; + #ifdef BASKER_TIMER + printf( " - overflow L_nnz = %ld (%d/%d)\n",t_nnz,i,M.ncol ); + for (Int ii = 0; ii <= i; ii++) printf( " * L_col_counts[%d] = %ld\n",ii,ST.L_row_counts[ii] ); + #endif break; } } @@ -2394,9 +2440,8 @@ int Basker::sfactor() printf("L_assign_nnz: %ld \n", t_nnz); #endif - // double fill_factor = 2.05; double old_nnz = M.nnz; - Int temp = min(M.nrow*M.ncol, Int(fill_factor*t_nnz)); + Int temp = min(M.nrow*M.ncol, Int(fill_factor*double(t_nnz))); if (temp >= t_nnz) { M.nnz = 
temp; } else { @@ -2432,7 +2477,7 @@ int Basker::sfactor() if(option == 0) { M.nnz = ST.S_col_counts(0); - #ifdef BASKER_DEBUG_SFACTOR + #ifdef BASKER_TIMER printf("S_assign_nnz: %ld \n", M.nnz); #endif @@ -2443,7 +2488,7 @@ int Basker::sfactor() } if(Options.verbose == BASKER_TRUE) { - printf("S_assign elbow global_nnz = %ld, M.nnz = %ld + 2\n", (long)global_nnz, (long)M.nnz); + printf("S_assign elbow global_nnz = %ld, M.nnz = %ld + 2 (%d x %d)\n", (long)global_nnz, (long)M.nnz, (int)M.nrow,(int)M.ncol); } if (M.nnz <= Int_MAX - 2) { M.nnz += 2; @@ -2461,9 +2506,9 @@ int Basker::sfactor() #ifdef BASKER_DEBUG_SFACTOR //printf("Test btf_last_dense \n"); //printf("btf_tabs_offset: %d col: %d \n", - // btf_tabs_offset, btf_tabs[btf_tabs_offset]); + // btf_tabs_offset, btf_tabs[btf_tabs_offset]); //printf("number of blks: %d \n", - // btf_nblks-btf_tabs_offset); + // btf_nblks-btf_tabs_offset); #endif #ifdef BASKER_TIMER printf( " > btf_last_dense(%s) <\n",(flag ? "true" : "false") ); fflush(stdout); @@ -2520,7 +2565,7 @@ int Basker::sfactor() //Malloc L and U #ifdef BASKER_DEBUG_SFACTOR printf("btf_nblks %d btf_tabs_offset %d \n", - btf_nblks, btf_tabs_offset); + btf_nblks, btf_tabs_offset); #endif Int nblks_left = btf_nblks - btf_tabs_offset; diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp index ef1e29d597e4..706066f3011b 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_structs.hpp @@ -117,7 +117,7 @@ namespace BaskerNS //Used to store information about the tree template struct basker_tree - { + { BASKER_INLINE basker_tree() { @@ -235,6 +235,7 @@ namespace BaskerNS INT_1DARRAY rowptr; INT_1DARRAY child; INT_1DARRAY sibling; + INT_1DARRAY leaf_nnz; };//end basker_tree diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp 
index 6e4d1554c754..161be9122f49 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_thread.hpp @@ -127,6 +127,12 @@ namespace BaskerNS Int ltask = (l*ntasks) + task; //printf("Enter Domain Barrier. leader=%d, lsize=%d (%d:%d), my_id=%d, task=%d, k=%d, l=%d -> ltask=%d\n", // my_leader, lsize, my_leader,my_leader+lsize-1, my_id, task, k, l, ltask); fflush(stdout); + #if 0 // debug + if (token[my_id][ltask] == k) { + printf( "\n BarrierDomain already k ??\n " ); + exit(0); + } + #endif token[my_id][ltask] = k; for(Int dp = (my_leader+lsize)-1; dp >= my_leader; dp--) { @@ -179,7 +185,7 @@ namespace BaskerNS //Atomic BaskerBarrier(volatile Int &value_in, volatile Int &value_out, - const Int l_size ) + const Int l_size ) { //jdb value ->value_in atomic_barrier(value_in,l_size); @@ -278,7 +284,6 @@ namespace BaskerNS BASKER_NO_OP; } } - }; //end BaskerBarrier diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp index f57447b10906..9d30b714553d 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_types.hpp @@ -73,7 +73,7 @@ enum BASKER_MATCHING_CODE #define BASKER_BTF_PRUNE_SIZE 100 #define BASKER_DOM_NNZ_OVER 1.0 //Added to control estimate for DOM blocks -#define BASKER_SEP_NNZ_OVER 3.0 //Added to control estimate for SEP blocks +#define BASKER_SEP_NNZ_OVER 2.0 //Added to control estimate for SEP blocks enum BASKER_INCOMPLETE_CODE { diff --git a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp index 455b76004a98..aae9b141aaf8 100644 --- a/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp +++ b/packages/shylu/shylu_node/basker/src/shylubasker_util.hpp @@ -464,9 +464,9 @@ namespace BaskerNS for(Int row = 0; row < LL_size(b); row++) { - #ifdef BASKER_DEBUG_INIT - 
printf("L Factor Init: %d %d , kid: %d, nnz: %ld \n", - b, row, kid, LL(b)(row).nnz); + #ifdef BASKER_TIMER + printf("L Factor Init: L(%d %d) , kid: %d, nnz = %ld (%d x %d)\n", + b, row, kid, LL(b)(row).nnz, LL(b)(row).nrow,LL(b)(row).ncol); #endif #ifdef BASKER_TIMER From 26b3eac40a0dd818b7cb5950744c5fba53087005 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Mon, 4 Nov 2024 09:11:53 -0700 Subject: [PATCH 091/243] MueLu: Fix clang-format Signed-off-by: Christian Glusa --- .../MueLu_CoalesceDropFactory_def.hpp | 285 +++++++++--------- .../test/unit_tests/CoalesceDropFactory.cpp | 118 ++++---- 2 files changed, 198 insertions(+), 205 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 1f9961289cb0..e2bae01ffa21 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -490,8 +490,8 @@ void CoalesceDropFactory::Build(Level GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; } } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - if(classicalAlgo == defaultAlgo) { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + if (classicalAlgo == defaultAlgo) { ghostedDiagVals = ghostedDiag->getData(0); } } @@ -510,7 +510,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { + if (classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); @@ -578,177 +578,180 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } // end for row - } - else { + } else { /* Cut Algorithm */ SubFactoryMonitor m1(*this, "Cut Drop", 
currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - using ATS = Kokkos::ArithTraits; + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; using impl_scalar_type = typename ATS::val_type; - using implATS = Kokkos::ArithTraits; + using implATS = Kokkos::ArithTraits; - //move from host to device + // move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto thresholdKokkos = static_cast(threshold); + auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto A_device = A->getLocalMatrixDevice(); - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - RCP importer = A->getCrsGraph()->getImporter(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); RCP boundaryColumnVector; - for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + for (size_t i = 0; i < graph->GetNodeNumVertices(); i++) { boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; } - if(!importer.is_null()) { + if (!importer.is_null()) { boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); - } - else { + } else { boundaryColumnVector = boundaryNodesVector; } auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); - auto boundary = 
Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + Kokkos::View rownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto rowView = A_device.rowConst(row); - size_t nnz = rowView.length; - - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - //Don't aggregate boundaries - if(row == col || boundary(col)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - } - }); - - size_t dropStart = nnz; - if (classicalAlgo == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - return x_aij > y_aij; - } - }); + Kokkos::parallel_reduce( + "classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& 
globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.rowConst(row); + size_t nnz = rowView.length; - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_aij = 0; - typename implATS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - } - if(!drop_view(y)) { - y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - } + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); - if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { - if(i < min) { - min = i; + // find magnitudes + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + // ignore diagonals for now, they are checked again later + // Don't aggregate boundaries + if (row == col || boundary(col)) { + drop_view(colID) = true; + } else { + drop_view(colID) = false; } - } - }, Kokkos::Min(dropStart)); - } else if (classicalAlgo == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = 
implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); - } - }); + }); - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_val = 0; - typename implATS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - y_val = y_aij / y_aiiajj; - } + size_t dropStart = nnz; + if (classicalAlgo == unscaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); - if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { - if(i < min) { - min = i; - } + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if (!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) 
* rowView.value(x)); + } + if (!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + } + + if (realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { + if (i < min) { + min = i; + } + } + }, + Kokkos::Min(dropStart)); + } else if (classicalAlgo == scaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if (!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if (!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if (realThresholdKokkos * realThresholdKokkos * x_val > y_val) { + if (i < min) { + min = i; + } + } + }, + 
Kokkos::Min(dropStart)); } - }, Kokkos::Min(dropStart)); - } - //drop everything to the right of where values stop passing threshold - if(dropStart < nnz) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } + // drop everything to the right of where values stop passing threshold + if (dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - columnsDevice(A_device.graph.row_map(row) + idxID) = col; - keep++; - } - else { - columnsDevice(A_device.graph.row_map(row) + idxID) = -1; - drop++; - } - }, rownnz, rowDropped); + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + // don't drop diagonal + if (row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; + keep++; + } else { + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; + drop++; + } + }, + rownnz, rowDropped); - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, + realnnz, numDropped); - //update column indices so that kept indices are aligned to the left for subview that happens later on + // update column indices so that kept indices are aligned to the left for subview that happens later on Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //update row indices by adding up new # of nnz in each row + // 
update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); + Kokkos::parallel_scan( + Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if (is_final) rowsDevice(i + 1) = partial_sum; + }); Kokkos::deep_copy(rows, rowsDevice); } diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index 0073ca7e9bfb..7ec8dbe27a3a 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1402,36 +1402,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, 
Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1461,19 +1456,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! 
const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1486,23 +1481,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for 
(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalScaledCut @@ -1525,36 +1520,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, 
negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1584,19 +1574,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1609,23 +1599,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); 
+ size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for (int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalUnScaledCut From 9b5fd842f76fdd9bb45d7a62aa8f0e10568f4e52 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:05:14 -0700 Subject: [PATCH 092/243] Add default shell setting to CodeQL job Add default shell setting to CodeQL job specifying to use a `bash -l` login shell to clean the syntax throughout the file where these were being individually specified. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index b739518db9ef..82c7df08f3e9 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -35,19 +35,20 @@ jobs: # only required for workflows in private repositories actions: read contents: read - strategy: fail-fast: false matrix: include: - language: c-cpp build-mode: manual + defaults: + run: + shell: bash -l steps: - name: Checkout repository uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL uses: github/codeql-action/init@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: @@ -65,33 +66,31 @@ jobs: env - name: Module list - shell: bash -l {0} run: | module list printenv PATH - if: matrix.build-mode == 'manual' name: Get dependencies + working-directory: ${GITHUB_WORKSPACE}/packages/framework run: | - bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + ./get_dependencies.sh --container + ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container - if: matrix.build-mode == 'manual' name: Generate CMake fragments - shell: bash -lc {0} run: | git fetch origin ${GITHUB_BASE_REF} mkdir -p trilinos_build && cd trilinos_build source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables - bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" + ${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake - if: matrix.build-mode == 'manual' name: Configure and build Trilinos - shell: bash -lc {0} + working-directory: ./trilinos_build run: | - cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake \ -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ From 6c999fcf300f274adb2e479671f7f3410e84c0b7 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:12:39 -0700 Subject: [PATCH 093/243] Fix bash argument syntax Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 
82c7df08f3e9..4ddc84a920b7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -43,7 +43,7 @@ jobs: build-mode: manual defaults: run: - shell: bash -l + shell: bash -l {0} steps: - name: Checkout repository From c322f5a454f59bf0c3048df3cdd72ca4e7f1ba26 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:19:40 -0700 Subject: [PATCH 094/243] Fix working-directory workflow setting Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4ddc84a920b7..6daaeb02e030 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -72,10 +72,9 @@ jobs: - if: matrix.build-mode == 'manual' name: Get dependencies - working-directory: ${GITHUB_WORKSPACE}/packages/framework + working-directory: ./packages/framework run: | ./get_dependencies.sh --container - ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container - if: matrix.build-mode == 'manual' name: Generate CMake fragments From f7fdee0000c9d9d81c2ff4affb22aa740c20a2be Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 12:49:18 -0700 Subject: [PATCH 095/243] Add workflow concurrency cancel and rename workflow Add workflow concurrency cancelation check for previous workflows associated with the same PR. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6daaeb02e030..fff932c18dc7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,25 +1,20 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. 
-# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL: Linear Solvers" +name: "CodeQL Security Scan" on: pull_request: - branches: [ "develop" ] + branches: + - develop types: - opened - synchronize schedule: - cron: '41 23 * * 2' +# Cancels any in progress workflows associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read From 127a471342446f90dcbd614596f6a8da8adc5738 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 07:51:38 +0000 Subject: [PATCH 096/243] Bump actions/checkout from 4.2.1 to 4.2.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/AT2.yml | 8 ++++---- .github/workflows/clang_format.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/dependency-review.yml | 2 +- .github/workflows/detect-git-lfs.yml | 2 +- .github/workflows/detect-mpi-comm-world.yml | 2 +- .github/workflows/per-commit.yml | 2 +- .github/workflows/scorecards.yml | 2 +- .github/workflows/spack.yml | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index b232051eddf2..c085620db33a 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -60,7 +60,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -151,7 +151,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -242,7 +242,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -334,7 +334,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index a3fd0968ad75..d0b7392226a0 100644 --- 
a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: DoozyX/clang-format-lint-action@c71d0bf4e21876ebec3e5647491186f8797fde31 # v0.18.2 with: source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4139508fa42b..3ee521f94e90 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -58,7 +58,7 @@ jobs: # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages steps: - name: Checkout repository - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index bf29beac76d5..955b3b3fb2d0 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: 'Dependency Review' uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 diff --git a/.github/workflows/detect-git-lfs.yml b/.github/workflows/detect-git-lfs.yml index ebe778088863..68595577ec7c 100644 --- a/.github/workflows/detect-git-lfs.yml +++ b/.github/workflows/detect-git-lfs.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/detect-mpi-comm-world.yml b/.github/workflows/detect-mpi-comm-world.yml index 1fd6790c8c86..e85d71db2f6a 100644 --- a/.github/workflows/detect-mpi-comm-world.yml +++ b/.github/workflows/detect-mpi-comm-world.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/per-commit.yml b/.github/workflows/per-commit.yml index 3f619a7dbbc0..80dfc8b94008 100644 --- a/.github/workflows/per-commit.yml +++ b/.github/workflows/per-commit.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/scorecards.yml 
b/.github/workflows/scorecards.yml index 46a2c4571aff..1ac917d3af8a 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -31,7 +31,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml index 59976c1d9b3e..3c3c01b75849 100644 --- a/.github/workflows/spack.yml +++ b/.github/workflows/spack.yml @@ -24,7 +24,7 @@ jobs: runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] steps: - name: Clone Trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 1 - name: Spack build From a61d6cf58541c77aadb82334ee0d89c350eb2521 Mon Sep 17 00:00:00 2001 From: "Curtis C. Ober" Date: Tue, 5 Nov 2024 16:32:11 -0700 Subject: [PATCH 097/243] Tpetra: Add LinearProblem As part of the effort to transition from Epetra to Tpetra, several features of EpetraExt need transition (e.g., Singleton Filtering). To begin, we need to create a Tpetra version of LinearProblem, which includes left and right scaling. A basic unit test is also included. Signed-off-by: Curtis C. 
Ober --- packages/tpetra/core/src/CMakeLists.txt | 4 + .../core/src/Tpetra_LinearProblem_decl.hpp | 218 +++++++++++++ .../core/src/Tpetra_LinearProblem_def.hpp | 192 +++++++++++ .../core/src/Tpetra_LinearProblem_fwd.hpp | 29 ++ .../core/src/Tpetra_MultiVector_decl.hpp | 2 +- packages/tpetra/core/test/CMakeLists.txt | 1 + .../core/test/LinearProblem/CMakeLists.txt | 8 + .../LinearProblem/LinearProblem_UnitTests.cpp | 297 ++++++++++++++++++ 8 files changed, 750 insertions(+), 1 deletion(-) create mode 100644 packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp create mode 100644 packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp create mode 100644 packages/tpetra/core/src/Tpetra_LinearProblem_fwd.hpp create mode 100644 packages/tpetra/core/test/LinearProblem/CMakeLists.txt create mode 100644 packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp diff --git a/packages/tpetra/core/src/CMakeLists.txt b/packages/tpetra/core/src/CMakeLists.txt index e77abc4d98c6..0cfece8b18f5 100644 --- a/packages/tpetra/core/src/CMakeLists.txt +++ b/packages/tpetra/core/src/CMakeLists.txt @@ -729,6 +729,10 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION) TPETRA_PROCESS_ALL_SLGN_TEMPLATES(BLOCKMULTIVECTOR_OUTPUT_FILES "Tpetra_ETI_SC_LO_GO_NT.tmpl" "BlockMultiVector" "BLOCKMULTIVECTOR" "${CrsMatrix_ETI_SCALARS}" "${TpetraCore_ETI_LORDS}" "${TpetraCore_ETI_GORDS}" "${TpetraCore_ETI_NODES}" TRUE ) LIST(APPEND SOURCES ${BLOCKMULTIVECTOR_OUTPUT_FILES}) + # Generate ETI .cpp files for Tpetra::LinearProblem. + TPETRA_PROCESS_ALL_SLGN_TEMPLATES(LINEARPROBLEM_OUTPUT_FILES "Tpetra_ETI_SC_LO_GO_NT.tmpl" "LinearProblem" "LINEARPROBLEM" "${CrsMatrix_ETI_SCALARS}" "${TpetraCore_ETI_LORDS}" "${TpetraCore_ETI_GORDS}" "${TpetraCore_ETI_NODES}" TRUE) + LIST(APPEND SOURCES ${LINEARPROBLEM_OUTPUT_FILES}) + # Generate ETI .cpp files for Tpetra::BlockVector. 
TPETRA_PROCESS_ALL_SLGN_TEMPLATES(BLOCKVECTOR_OUTPUT_FILES "Tpetra_ETI_SC_LO_GO_NT.tmpl" "BlockVector" "BLOCKVECTOR" diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp new file mode 100644 index 000000000000..a5601589fa2f --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp @@ -0,0 +1,218 @@ +// @HEADER +// ***************************************************************************** +// Tpetra: Templated Linear Algebra Services Package +// +// Copyright 2008 NTESS and the Tpetra contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef TPETRA_LINEARPROBLEM_DECL_HPP +#define TPETRA_LINEARPROBLEM_DECL_HPP + +/// \file Tpetra_LinearProblem_decl.hpp +/// \brief Declaration of the Tpetra::LinearProblem class + +#include "Teuchos_DataAccess.hpp" + +#include "Tpetra_Vector_decl.hpp" +#include "Tpetra_MultiVector_decl.hpp" +#include "Tpetra_RowMatrix_decl.hpp" +#include "Tpetra_DistObject.hpp" +#include "Tpetra_Details_ExecutionSpacesUser.hpp" + +namespace Tpetra { + + /// \class LinearProblem + /// \brief Class that encapsulates a linear problem (Ax = b). + /// + /// The LinearProblem class is a wrapper that encapsulates the + /// general information needed for solving a linear system of + /// equations. Currently it accepts a Tpetra matrix/operator, + /// initial guess and RHS and returns the solution. + /// + /// \tparam Scalar The type of the numerical entries of the matrix. + /// (You can use real-valued or complex-valued types here.) + /// \tparam LocalOrdinal The type of local indices. See the + /// documentation of Map for requirements. + /// \tparam GlobalOrdinal The type of global indices. See the + /// documentation of Map for requirements. + /// \tparam Node The Kokkos Node type. See the documentation + /// of Map for requirements.
+ + template + class LinearProblem : + public DistObject, + public Details::Spaces::User + { + + private: + /// Type of the DistObject specialization from which this class inherits. + using dist_object_type = DistObject; + + public: + //! @name Typedefs + //@{ + + using map_type = Map; + using row_matrix_type = RowMatrix; + using multivector_type = MultiVector; + using vector_type = Vector; + using operator_type = Operator; + using linear_problem_type = LinearProblem; + + //@} + + //! @name Constructors/Destructor + //@{ + + /// \brief Default Constructor. + /// + /// Creates an empty LinearProblem instance. The operator + /// A, left-hand-side X and right-hand-side B must be set + /// using the setOperator(), setLHS() and setRHS() methods + /// respectively. + LinearProblem(); + + /// \brief Constructor with a matrix as the operator. + /// + /// Creates a LinearProblem instance where the operator + /// is passed in as a matrix. + LinearProblem(const Teuchos::RCP & A, + const Teuchos::RCP& X, + const Teuchos::RCP& B); + + /// \brief Constructor with Operator. + /// + /// Creates a LinearProblem instance for the case where + /// an operator is not necessarily a matrix. + LinearProblem(const Teuchos::RCP & A, + const Teuchos::RCP& X, + const Teuchos::RCP& B); + + //! Copy Constructor. + LinearProblem(const LinearProblem& Problem); + + //! LinearProblem Destructor. + virtual ~LinearProblem() = default; + + //@} + + //! @name Integrity check method + //@{ + + /// \brief Check input parameters for existence and size consistency. + /// + /// Returns 0 if all input parameters are valid. Returns +1 + /// if operator is not a matrix. This is not necessarily + /// an error, but no scaling can be done if the user passes + /// in an operator that is not a matrix. + int checkInput(bool fail_on_error = true) const; + + //@} + + //! @name Implementation of DistObject interface + //@{ + + virtual bool + checkSizes (const SrcDistObject& source) override; + + //@} + + + //!
@name Set methods + //@{ + + /// \brief Set Operator A of linear problem AX = B using a RowMatrix. + /// + /// Sets an RCP to a RowMatrix. No copy of the operator is made. + void setOperator(Teuchos::RCP A) + { A_ = A; Operator_ = A; } + + /// \brief Set Operator A of linear problem AX = B using an Operator. + /// + /// Sets an RCP to an Operator. No copy of the operator is made. + void setOperator(Teuchos::RCP A) + { A_ = Teuchos::rcp_dynamic_cast(A); Operator_ = A; } + + /// \brief Set left-hand-side X of linear problem AX = B. + /// + /// Sets an RCP to a MultiVector. No copy of the object is made. + void setLHS(Teuchos::RCP X) {X_ = X;} + + /// \brief Set right-hand-side B of linear problem AX = B. + /// + /// Sets an RCP to a MultiVector. No copy of the object is made. + void setRHS(Teuchos::RCP B) {B_ = B;} + + //@} + + //! @name Computational methods + //@{ + + /// \brief Perform left scaling of a linear problem. + /// + /// Applies the scaling vector D to the left side of the + /// matrix A() and to the right hand side B(). Note that + /// the operator must be a RowMatrix, not just an Operator. + /// + /// \param In + /// D - Vector containing scaling values. D[i] will + /// be applied to the ith row of A() and B(). + /// mode - Indicating if transposed. + /// \note This method returns void; no error code is reported. + /// The matrix must be set, since it is dereferenced unchecked. + void leftScale(const Teuchos::RCP & D, + Teuchos::ETransp mode = Teuchos::NO_TRANS); + + /// \brief Perform right scaling of a linear problem. + /// + /// Applies the scaling vector D to the right side of the + /// matrix A(). Apply the inverse of D to the initial + /// guess. Note that the operator must be a RowMatrix, + /// not just an Operator. + /// + /// \param In + /// D - Vector containing scaling values. D[i] will + /// be applied to the ith column of A(). 1/D[i] will + /// be applied to the ith row of X(). + /// mode - Indicating if transposed.
+ /// \note This method returns void; no error code is reported. + /// The matrix must be set, since it is dereferenced unchecked. + void rightScale(const Teuchos::RCP & D, + Teuchos::ETransp mode = Teuchos::NO_TRANS); + + //@} + + //! @name Accessor methods + //@{ + + //! Get an RCP to the operator A. + Teuchos::RCP getOperator() const {return(Operator_);}; + //! Get an RCP to the matrix A. + Teuchos::RCP getMatrix() const {return(A_);}; + //! Get an RCP to the left-hand-side X. + Teuchos::RCP getLHS() const {return(X_);}; + //! Get an RCP to the right-hand-side B. + Teuchos::RCP getRHS() const {return(B_);}; + + //@} + + private: + + Teuchos::RCP Operator_; + Teuchos::RCP A_; + Teuchos::RCP X_; + Teuchos::RCP B_; + + LinearProblem & operator=(const LinearProblem& Problem) = default; +}; + +} // namespace Tpetra + +#endif // TPETRA_LINEARPROBLEM_DECL_HPP diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp new file mode 100644 index 000000000000..074cda87581c --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp @@ -0,0 +1,192 @@ +// @HEADER +// ***************************************************************************** +// Tpetra: Templated Linear Algebra Services Package +// +// Copyright 2008 NTESS and the Tpetra contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef TPETRA_LINEARPROBLEM_DEF_HPP +#define TPETRA_LINEARPROBLEM_DEF_HPP + +/// \file Tpetra_LinearProblem_def.hpp +/// \brief Definition of the Tpetra::LinearProblem class +/// +/// If you want to use Tpetra::LinearProblem, include +/// "Tpetra_LinearProblem.hpp" (a file which CMake generates and installs +/// for you). If you only want the declaration of Tpetra::LinearProblem, +/// include "Tpetra_LinearProblem_decl.hpp".
+ +#include "Teuchos_DataAccess.hpp" +#include "Teuchos_TestForException.hpp" +#include "Tpetra_Details_Behavior.hpp" +#include "Tpetra_MultiVector.hpp" +#include "Tpetra_MultiVector_decl.hpp" + +namespace Tpetra { + + template + LinearProblem:: + LinearProblem () + : dist_object_type (Teuchos::rcp (new map_type ())), + Operator_(Teuchos::null), + A_(Teuchos::null), + X_(Teuchos::null), + B_(Teuchos::null) + { + } + + template + LinearProblem:: + LinearProblem (const Teuchos::RCP & A, + const Teuchos::RCP& X, + const Teuchos::RCP& B) + : dist_object_type (A->getDomainMap()), + Operator_(Teuchos::null), + A_(A), + X_(X), + B_(B) + { + // Try to make matrix an operator + Operator_ = Teuchos::rcp_dynamic_cast(A_); + } + + template + LinearProblem:: + LinearProblem (const Teuchos::RCP & A, + const Teuchos::RCP& X, + const Teuchos::RCP& B) + : dist_object_type (*X), + Operator_(A), + A_(Teuchos::null), + X_(X), + B_(B) + { + // Try to make operator a matrix + A_ = Teuchos::rcp_dynamic_cast(Operator_); + } + + template + LinearProblem:: + LinearProblem (const LinearProblem& Problem) + : dist_object_type (Problem), + Operator_(Problem.Operator_), + A_(Problem.A_), + X_(Problem.X_), + B_(Problem.B_) + { + } + + template + void LinearProblem:: + leftScale(const Teuchos::RCP & D, Teuchos::ETransp mode) + { + const Scalar ST0 = Teuchos::ScalarTraits::zero(); + const Scalar ST1 = Teuchos::ScalarTraits::one(); + if (mode == Teuchos::NO_TRANS) { + A_->leftScale(*D); + B_->elementWiseMultiply(ST1, *D, *B_, ST0); + } + else { + A_->rightScale(*D); + vector_type R(*D, Teuchos::DataAccess::Copy); + R.reciprocal(*D); + X_->elementWiseMultiply(ST1, R, *X_, ST0); + } + + return; + } + + template + void LinearProblem:: + rightScale(const Teuchos::RCP & D, Teuchos::ETransp mode) + { + const Scalar ST0 = Teuchos::ScalarTraits::zero(); + const Scalar ST1 = Teuchos::ScalarTraits::one(); + if (mode == Teuchos::NO_TRANS) { + A_->rightScale(*D); + vector_type R(*D, Teuchos::DataAccess::Copy); 
+ R.reciprocal(*D); + X_->elementWiseMultiply(ST1, R, *X_, ST0); + } + else { + A_->leftScale(*D); + B_->elementWiseMultiply(ST1, *D, *B_, ST0); + } + return; + } + + template + int LinearProblem:: + checkInput(bool fail_on_error) const { + + int error = 0; + if (fail_on_error) { + + const char tfecfFuncName[] = "checkInput: "; + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(Operator_==Teuchos::null, + std::logic_error, "Operator_ is unset."); + + TPETRA_ABUSE_WARNING(A_==Teuchos::null, std::runtime_error, + "Linear problem does not have a matrix (A_), just an operator."); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_==Teuchos::null, + std::logic_error, "Solution vector (X_) is unset."); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(B_==Teuchos::null, + std::logic_error, "RHS vector (B_) is unset."); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(X_->getMap())), + std::logic_error, "Domain map of matrix is not the 'same as' the solution map."); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(B_->getMap())), + std::logic_error, "Range map of matrix is not the 'same as' the RHS map."); + } + else { + if (Operator_==Teuchos::null) error = -1; + if (A_==Teuchos::null) error = 1; // Return warning error because this problem has no matrix (just an operator) + if (X_==Teuchos::null) error = -2; + if (B_==Teuchos::null) error = -3; + + if (!A_->getRowMap()->isSameAs(*(X_->getMap()))) error = -4; + if (!A_->getRowMap()->isSameAs(*(B_->getMap()))) error = -5; + } + + return error; + } + + template + bool LinearProblem:: + checkSizes (const SrcDistObject& sourceObj) + { + // Check whether the source object is a LinearProblem. If not, then + // we can't even compare sizes. 
+ typedef LinearProblem LP; + const LP* src = dynamic_cast (&sourceObj); + if (src == nullptr) { + return false; + } + else { + this->checkInput(false); + src->checkInput(false); + + return ((this->A_->getDomainMap() == src->getMatrix()->getDomainMap()) and + (this->A_->getRangeMap() == src->getMatrix()->getRangeMap())); + } + } + +} // namespace Tpetra + +// +// Explicit instantiation macro +// +// Must be expanded from within the Tpetra namespace! +// + +#define TPETRA_LINEARPROBLEM_INSTANT(SCALAR,LO,GO,NODE) \ + template class LinearProblem< SCALAR , LO , GO , NODE >; + + +#endif // TPETRA_LINEARPROBLEM_DEF_HPP diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_fwd.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_fwd.hpp new file mode 100644 index 000000000000..143e5a89d719 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_fwd.hpp @@ -0,0 +1,29 @@ +// @HEADER +// ***************************************************************************** +// Tpetra: Templated Linear Algebra Services Package +// +// Copyright 2008 NTESS and the Tpetra contributors. 
+// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef TPETRA_LINEARPROBLEM_FWD_HPP +#define TPETRA_LINEARPROBLEM_FWD_HPP + +#include "Tpetra_Details_DefaultTypes.hpp" + +/// \file Tpetra_LinearProblem_fwd.hpp +/// \brief Forward declaration of Tpetra::LinearProblem + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace Tpetra { +template +class LinearProblem; + +} // namespace Tpetra +#endif // DOXYGEN_SHOULD_SKIP_THIS + +#endif // TPETRA_LINEARPROBLEM_FWD_HPP diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp index fe1c639c02c2..17d51223c3d0 100644 --- a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp @@ -2016,7 +2016,7 @@ namespace Tpetra { /// \brief Multiply a Vector A elementwise by a MultiVector B. /// /// Compute this = scalarThis * this + scalarAB * B @ A - /// where @ denotes element-wise multiplication. In + /// where \@ denotes element-wise multiplication. 
In /// pseudocode, if C denotes *this MultiVector: /// \code /// C(i,j) = scalarThis * C(i,j) + scalarAB * B(i,j) * A(i,1); diff --git a/packages/tpetra/core/test/CMakeLists.txt b/packages/tpetra/core/test/CMakeLists.txt index 2144ebaf2c07..a53e978403c5 100644 --- a/packages/tpetra/core/test/CMakeLists.txt +++ b/packages/tpetra/core/test/CMakeLists.txt @@ -22,6 +22,7 @@ ADD_SUBDIRECTORIES( ImportExport ImportExport2 inout + LinearProblem Map MatrixMatrix Merge diff --git a/packages/tpetra/core/test/LinearProblem/CMakeLists.txt b/packages/tpetra/core/test/LinearProblem/CMakeLists.txt new file mode 100644 index 000000000000..732166cb223b --- /dev/null +++ b/packages/tpetra/core/test/LinearProblem/CMakeLists.txt @@ -0,0 +1,8 @@ +TRIBITS_ADD_EXECUTABLE_AND_TEST( + LinearProblem_UnitsTests + SOURCES + LinearProblem_UnitTests.cpp + ${TEUCHOS_STD_UNIT_TEST_MAIN} + COMM serial mpi + STANDARD_PASS_OUTPUT + ) diff --git a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp new file mode 100644 index 000000000000..6d76d58dd997 --- /dev/null +++ b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp @@ -0,0 +1,297 @@ +// @HEADER +// ***************************************************************************** +// Tpetra: Templated Linear Algebra Services Package +// +// Copyright 2008 NTESS and the Tpetra contributors. 
+// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#include "Tpetra_TestingUtilities.hpp" +#include "Tpetra_LinearProblem.hpp" + +#include "Tpetra_CrsMatrix.hpp" + + +namespace { // (anonymous) + + using Tpetra::TestingUtilities::getDefaultComm; + using Tpetra::createContigMapWithNode; + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::Comm; + using Teuchos::Array; + using Teuchos::tuple; + //using Teuchos::NO_TRANS; + //using Teuchos::TRANS; + //using Teuchos::CONJ_TRANS; + using std::endl; + using GST = Tpetra::global_size_t; + + + /// \brief Print out pretty version of RowMatrix. + template + void Display_CrsMatrix (std::string label, RCP > A, RCP< const Comm< int > > comm, Teuchos::FancyOStream& myOut) + { + using local_ordinal_type = typename Tpetra::Vector::local_ordinal_type; + + using crs_matrix_type = typename Tpetra::CrsMatrix; + + using crs_local_inds_host_view_type = typename crs_matrix_type::local_inds_host_view_type; + using crs_values_host_view_type = typename crs_matrix_type::values_host_view_type; + + const local_ordinal_type INVALID = Teuchos::OrdinalTraits::invalid(); + + // Get the number of rows and columns + GST numRows = A->getGlobalNumRows(); + GST numCols = A->getGlobalNumCols(); + + // Loop over all global rows + for (GST globalRow = 0; globalRow < numRows; ++globalRow) { + // Check if this row belongs to the current process + if (A->getRowMap()->getLocalElement(globalRow) != INVALID) { + myOut << "Row " << std::setw(2) << globalRow << " [ "; + + // Extract the row view + crs_local_inds_host_view_type localIndices; + crs_values_host_view_type values; + size_t localRow = A->getRowMap()->getLocalElement(globalRow); + A->getLocalRowView(localRow, localIndices, values); + + // Initialize a vector to track printed entries + std::vector printed(numCols, false); + + // Print the entries in the row + size_t numEntries = 
A->getNumEntriesInLocalRow(localRow); + for (size_t k = 0; k < numEntries; ++k) { + // Convert local index to global index + GST globalIndex = A->getColMap()->getGlobalElement(localIndices(k)); + printed[globalIndex] = true; // Mark the index as having a non-zero entry + } + + // Print the values for each column + for (GST j = 0; j < numCols; ++j) { + if (printed[j]) { + // Find the corresponding value for the global index + for (size_t k = 0; k < numEntries; ++k) { + // Convert local index to global index + GST globalIndex = A->getColMap()->getGlobalElement(localIndices(k)); + if (globalIndex == j) { + myOut << std::setw(8) << values(k) << " "; + break; + } + } + } else { + myOut << std::setw(8) << 0 << " "; + } + } + myOut << "]" << endl; + } + // Synchronize processes before printing + MPI_Barrier(MPI_COMM_WORLD); + } + } + + template + void Display_MultiVector (std::string label, Teuchos::RCP> multivector, Teuchos::RCP< const Teuchos::Comm< int > > comm, Teuchos::FancyOStream& myOut) + { + using local_ordinal_type = typename Tpetra::Vector<>::local_ordinal_type; + const local_ordinal_type INVALID = Teuchos::OrdinalTraits::invalid(); + + auto map = multivector->getMap(); + const size_t myImageID = comm->getRank(); + + if (myImageID==0) { + myOut << label << endl; + myOut << std::setw(8) << "Rank" << std::setw(12) << "GID" << std::setw(20) << "Value(s)" << endl; + } + GST numRows = multivector->getGlobalLength(); + for (GST globalRow = 0; globalRow < numRows; ++globalRow) { + // Check if this row belongs to the current process + if (map->getLocalElement(globalRow) != INVALID) { + size_t localElement = map->getLocalElement(globalRow); + myOut << std::setw(8) << myImageID << std::setw(12) << globalRow << " "; + for (size_t j = 0; j < multivector->getNumVectors(); ++j) { + myOut << std::setw(10) << multivector->getData(j)[localElement]; + } + myOut << endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + template + void Display_Vector (std::string label, 
Teuchos::RCP> vector, Teuchos::RCP< const Teuchos::Comm< int > > comm, Teuchos::FancyOStream& myOut) + { + auto multivector = Teuchos::rcp_dynamic_cast> (vector); + Display_MultiVector(label, multivector, comm, myOut); + } + + + // + // UNIT TESTS + // + + //// + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( LinearProblem, basic, LO, GO, Scalar, Node ) + { + using map_type = Tpetra::Map; + using ST = Teuchos::ScalarTraits; + using mag_type = typename ST::magnitudeType; + + using MAT = Tpetra::CrsMatrix; + using VT = Tpetra::Vector; + using MV = Tpetra::MultiVector; + using LPT = Tpetra::LinearProblem; + //using local_ordinal_type = typename Tpetra::Vector::local_ordinal_type; + using global_ordinal_type = typename Tpetra::Vector::global_ordinal_type; + + + const global_ordinal_type INVALID = Teuchos::OrdinalTraits::invalid(); + constexpr bool debug = true; + + RCP outPtr = debug ? + Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cerr)) : + Teuchos::rcpFromRef (out); + Teuchos::FancyOStream& myOut = *outPtr; + Teuchos::OSTab tab0 (myOut); + + myOut << "Test: LinearProblem, Constructors" << endl; + + RCP > comm = getDefaultComm(); + //const size_t numImages = comm->getSize(); + const size_t myImageID = comm->getRank(); + // create a Map + const size_t numLocal = 10; + const size_t numVecs = 1; + RCP map = createContigMapWithNode(INVALID,numLocal,comm); + GO base = numLocal*myImageID; + GST globalNumElements = map->getGlobalNumElements(); + RCP > A; + { + RCP A_crs = rcp(new MAT(map,3)); + for (size_t i=0; iinsertGlobalValues(base+i,tuple(base+i),tuple(ST::one())); + A_crs->insertGlobalValues(base + i, tuple(base + i), tuple(2.0)); // Diagonal entry + + GST globalIndex = base + i; + // Insert the first subdiagonal entry if not in the first row + if (globalIndex > 0) { + A_crs->insertGlobalValues(base + i, tuple(base + i - 1), tuple(1.0)); // Subdiagonal entry + } + + // Insert the first superdiagonal entry if not in the last row + if (globalIndex < globalNumElements - 1) { 
+ A_crs->insertGlobalValues(base + i, tuple(base + i + 1), tuple(1.0)); // Superdiagonal entry + } + } + A_crs->fillComplete(); + A = A_crs; + } + + // create solution, rhs and scaling vector + RCP X = rcp (new MV (map, numVecs)); + RCP B = rcp (new MV (map, numVecs)); + RCP S = rcp (new VT (map)); + + // Assign values to the MultiVector based on the global index + for (size_t j = 0; j < numVecs; ++j) { // Loop over each vector (column) + for (GST i = 0; i < globalNumElements; ++i) { + // Assign a value (for example, the global index plus the vector index) + X->replaceGlobalValue(i, j, Teuchos::as(i + j + 1)); + B->replaceGlobalValue(i, j, Teuchos::as(i + j + 1)); + } + } + for (GST i = 0; i < globalNumElements; ++i) { + S->replaceGlobalValue(i, Teuchos::as(i + 1)); + } + + RCP linearProblem = rcp(new LPT()); + + linearProblem->setOperator(A); + linearProblem->setLHS(X); + linearProblem->setRHS(B); + + linearProblem->checkInput(); + + //if (myImageID==0) myOut << "Original LinearProblem" << endl; + //Display_CrsMatrix("A", A, comm, myOut); + //Display_MultiVector("Solution Vector", X, comm, myOut); + //Display_MultiVector("RHS Vector", B, comm, myOut); + //Display_Vector("Scaling Vector", S, comm, myOut); + + // Original LinearProblem + GST N = globalNumElements; + double normF = std::sqrt(6*N - 2); + TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), + Teuchos::as(normF), Teuchos::as(1.0e-14)); + //Teuchos::as(7.615773105863909), Teuchos::as(1.0e-14)); + + Array norms(numVecs); + linearProblem->getLHS()->norm1(norms()); + size_t vector_sum = N*(N+1)/2; + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + linearProblem->getRHS()->norm1(norms()); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + + // Left Scaling + linearProblem->leftScale(S); + + size_t vector_sum_squared = N*(N+1)*(2*N+1)/6; + normF = std::sqrt(6*vector_sum_squared - N*N - 1); + 
TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), + Teuchos::as(normF), Teuchos::as(1.0e-14)); + linearProblem->getLHS()->norm1(norms()); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + linearProblem->getRHS()->norm1(norms()); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), Teuchos::as(1.0e-14)); + + //if (myImageID==0) myOut << "After Left Scaling" << endl; + //Display_CrsMatrix("A", A, comm, myOut); + //Display_MultiVector("Solution Vector", X, comm, myOut); + //Display_MultiVector("RHS Vector", B, comm, myOut); + //Display_Vector("Scaling Vector", S, comm, myOut); + + // Right Scaling + linearProblem->rightScale(S); + + N = N-1; + size_t off_diags = 2.0*((N * (N + 1) * (2 * N + 1) * (3 * N * N + 3 * N - 1)) / 30.0 + + (N * N * (N + 1) * (N + 1)) / 2.0 + + (N * (N + 1) * (2 * N + 1)) / 6.0); + N = N+1; + size_t diag = (2.0 * N * (N + 1) * (2 * N + 1) * (3 * N * N + 3 * N - 1)) / 15.0; + normF = std::sqrt(diag + off_diags); + TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), + Teuchos::as(normF), Teuchos::as(1.0e-14)); + linearProblem->getLHS()->norm1(norms()); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(N), Teuchos::as(1.0e-14)); + linearProblem->getRHS()->norm1(norms()); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), Teuchos::as(1.0e-14)); + + //if (myImageID==0) myOut << "After Right Scaling" << endl; + //Display_CrsMatrix("A", A, comm, myOut); + //Display_MultiVector("Solution Vector", X, comm, myOut); + //Display_MultiVector("RHS Vector", B, comm, myOut); + //Display_Vector("Scaling Vector", S, comm, myOut); + + // Constructor with matrix + { + RCP linearProblem_Matrix = rcp(new LPT(A,X,B)); + linearProblem_Matrix->checkInput(); + } + + } + +// +// INSTANTIATIONS +// + +#define UNIT_TEST_GROUP( SCALAR, LO, GO, NODE ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( LinearProblem, basic, LO, GO, SCALAR, NODE ) + + TPETRA_ETI_MANGLING_TYPEDEFS() + 
+ TPETRA_INSTANTIATE_SLGN( UNIT_TEST_GROUP ) + +} From d2ec21bd0c345fd1cf0150355885b6d2d28a4387 Mon Sep 17 00:00:00 2001 From: "Curtis C. Ober" Date: Wed, 6 Nov 2024 08:52:57 -0700 Subject: [PATCH 098/243] Remove error code from checkInput and remove explicit usage of MPI. Signed-off-by: Curtis C. Ober --- .../core/src/Tpetra_LinearProblem_decl.hpp | 2 +- .../core/src/Tpetra_LinearProblem_def.hpp | 49 +++++++------------ .../LinearProblem/LinearProblem_UnitTests.cpp | 4 +- 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp index a5601589fa2f..bdea3fc35626 100644 --- a/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp @@ -111,7 +111,7 @@ namespace Tpetra { /// if operator is not a matrix. This is not necessarily /// an error, but no scaling can be done if the user passes /// in an operator that is not an matrix. 
- int checkInput(bool fail_on_error = true) const; + void checkInput() const; //@} diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp index 074cda87581c..78612e75e344 100644 --- a/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp @@ -118,43 +118,30 @@ namespace Tpetra { } template - int LinearProblem:: - checkInput(bool fail_on_error) const { - - int error = 0; - if (fail_on_error) { + void LinearProblem:: + checkInput() const { - const char tfecfFuncName[] = "checkInput: "; + const char tfecfFuncName[] = "checkInput: "; - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(Operator_==Teuchos::null, - std::logic_error, "Operator_ is unset."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(Operator_==Teuchos::null, + std::logic_error, "Operator_ is unset."); - TPETRA_ABUSE_WARNING(A_==Teuchos::null, std::runtime_error, - "Linear problem does not have a matrix (A_), just an operator."); + TPETRA_ABUSE_WARNING(A_==Teuchos::null, std::runtime_error, + "Linear problem does not have a matrix (A_), just an operator."); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_==Teuchos::null, - std::logic_error, "Solution vector (X_) is unset."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_==Teuchos::null, + std::logic_error, "Solution vector (X_) is unset."); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(B_==Teuchos::null, - std::logic_error, "RHS vector (B_) is unset."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(B_==Teuchos::null, + std::logic_error, "RHS vector (B_) is unset."); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(X_->getMap())), - std::logic_error, "Domain map of matrix is not the 'same as' the solution map."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(X_->getMap())), + std::logic_error, "Domain map of matrix is not the 'same as' the solution map."); - 
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(B_->getMap())), - std::logic_error, "Range map of matrix is not the 'same as' the RHS map."); - } - else { - if (Operator_==Teuchos::null) error = -1; - if (A_==Teuchos::null) error = 1; // Return warning error because this problem has no matrix (just an operator) - if (X_==Teuchos::null) error = -2; - if (B_==Teuchos::null) error = -3; + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!A_->getRowMap()->isSameAs(*(B_->getMap())), + std::logic_error, "Range map of matrix is not the 'same as' the RHS map."); - if (!A_->getRowMap()->isSameAs(*(X_->getMap()))) error = -4; - if (!A_->getRowMap()->isSameAs(*(B_->getMap()))) error = -5; - } - - return error; + return; } template @@ -169,8 +156,8 @@ namespace Tpetra { return false; } else { - this->checkInput(false); - src->checkInput(false); + this->checkInput(); + src->checkInput(); return ((this->A_->getDomainMap() == src->getMatrix()->getDomainMap()) and (this->A_->getRangeMap() == src->getMatrix()->getRangeMap())); diff --git a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp index 6d76d58dd997..d56480dcc2e0 100644 --- a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp +++ b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp @@ -88,7 +88,7 @@ namespace { // (anonymous) myOut << "]" << endl; } // Synchronize processes before printing - MPI_Barrier(MPI_COMM_WORLD); + comm->barrier(); } } @@ -116,7 +116,7 @@ namespace { // (anonymous) } myOut << endl; } - MPI_Barrier(MPI_COMM_WORLD); + comm->barrier(); } } From 0b3f08df7492cb9c580b902b862bfcf07097a1f1 Mon Sep 17 00:00:00 2001 From: maxfirmbach Date: Mon, 4 Nov 2024 10:01:09 -0700 Subject: [PATCH 099/243] Make AggregateQualityFactory a transfer factory Signed-off-by: maxfirmbach --- .../MueLu_NotayAggregationFactory_def.hpp | 7 --- .../MueLu_UncoupledAggregationFactory_def.hpp | 10 ----- 
.../MueLu_ParameterListInterpreter_def.hpp | 43 ++++++++++--------- ...u_AggregateQualityEstimateFactory_decl.hpp | 13 +++--- ...Lu_AggregateQualityEstimateFactory_def.hpp | 26 +++++------ .../aggregatequalities.xml | 7 +++ .../aggregatequalities.xml | 6 +-- .../Output/aggregatequalities_epetra.gold | 20 ++++----- .../Output/aggregatequalities_tpetra.gold | 20 ++++----- .../AggregateQualityEstimateFactory.cpp | 37 ++++++++++------ 10 files changed, 98 insertions(+), 91 deletions(-) rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_decl.hpp (88%) rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_def.hpp (96%) create mode 100644 packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml diff --git a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp index b432ffb1d868..40f4635e0b3d 100644 --- a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp @@ -55,7 +55,6 @@ RCP NotayAggregationFactorysetEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("aggregation: pairwise: size"); SET_VALID_ENTRY("aggregation: pairwise: tie threshold"); - SET_VALID_ENTRY("aggregation: compute aggregate qualities"); SET_VALID_ENTRY("aggregation: Dirichlet threshold"); SET_VALID_ENTRY("aggregation: ordering"); #undef SET_VALID_ENTRY @@ -64,21 +63,15 @@ RCP NotayAggregationFactoryset>("A", null, "Generating factory of the matrix"); validParamList->set>("Graph", null, "Generating factory of the graph"); validParamList->set>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); - validParamList->set>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'"); return validParamList; } 
template void NotayAggregationFactory::DeclareInput(Level& currentLevel) const { - const ParameterList& pL = GetParameterList(); - Input(currentLevel, "A"); Input(currentLevel, "Graph"); Input(currentLevel, "DofsPerNode"); - if (pL.get("aggregation: compute aggregate qualities")) { - Input(currentLevel, "AggregateQualities"); - } } template diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp index fdbb1106294c..386451d1cfc3 100644 --- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp @@ -75,14 +75,12 @@ RCP UncoupledAggregationFactoryset>("Graph", null, "Generating factory of the graph"); validParamList->set>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); - validParamList->set>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'"); // special variables necessary for OnePtAggregationAlgorithm validParamList->set("OnePt aggregate map name", "", "Name of input map for single node aggregates. 
(default='')"); @@ -131,10 +129,6 @@ void UncoupledAggregationFactory::DeclareInpu Input(currentLevel, "nodeOnInterface"); } } - - if (pL.get("aggregation: compute aggregate qualities")) { - Input(currentLevel, "AggregateQualities"); - } } template @@ -375,10 +369,6 @@ void UncoupledAggregationFactory::Build(Level aggregates->ComputeAggregateSizes(true /*forceRecompute*/); Set(currentLevel, "Aggregates", aggregates); - - if (pL.get("aggregation: compute aggregate qualities")) { - RCP> aggQualities = Get>>(currentLevel, "AggregateQualities"); - } } template diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp index e46d286abb90..207791bf5b5b 100644 --- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp +++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp @@ -1098,7 +1098,6 @@ void ParameterListInterpreter:: MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: preserve Dirichlet points", bool, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: error on nodes with no on-rank neighbors", bool, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: phase3 avoid singletons", bool, aggParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams); aggFactory->SetParameterList(aggParams); // make sure that the aggregation factory has all necessary data aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph")); @@ -1180,7 +1179,6 @@ void ParameterListInterpreter:: MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: pairwise: tie threshold", double, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: Dirichlet threshold", double, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: ordering", std::string, aggParams); - 
MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams); aggFactory->SetParameterList(aggParams); aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph")); aggFactory->SetFactory("Graph", manager.GetFactory("Graph")); @@ -1200,25 +1198,6 @@ void ParameterListInterpreter:: coarseMap->SetFactory("Aggregates", manager.GetFactory("Aggregates")); manager.SetFactory("CoarseMap", coarseMap); - // Aggregate qualities - if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) { - RCP aggQualityFact = rcp(new AggregateQualityEstimateFactory()); - ParameterList aggQualityParams; - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams); - aggQualityFact->SetParameterList(aggQualityParams); - manager.SetFactory("AggregateQualities", aggQualityFact); - - assert(aggType == "uncoupled"); - aggFactory->SetFactory("AggregateQualities", aggQualityFact); - } - // Tentative P MUELU_KOKKOS_FACTORY(Ptent, TentativePFactory, TentativePFactory_kokkos); 
ParameterList ptentParams; @@ -1319,6 +1298,28 @@ void ParameterListInterpreter:: RAPs->SetFactory("R", manager.GetFactory("R")); } + // Aggregate qualities + if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) { + RCP aggQualityFact = rcp(new AggregateQualityEstimateFactory()); + ParameterList aggQualityParams; + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams); + aggQualityFact->SetParameterList(aggQualityParams); + aggQualityFact->SetFactory("Aggregates", manager.GetFactory("Aggregates")); + aggQualityFact->SetFactory("CoarseMap", manager.GetFactory("CoarseMap")); + + if (!RAP.is_null()) + RAP->AddTransferFactory(aggQualityFact); + else + RAPs->AddTransferFactory(aggQualityFact); + } + if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: export visualization data", bool, true)) { RCP aggExport = rcp(new AggregationExportFactory()); ParameterList aggExportParams; diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp 
b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp similarity index 88% rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp index be87ec960139..473ad53ce0bf 100644 --- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp @@ -11,7 +11,7 @@ #define MUELU_AGGREGATEQUALITYESTIMATEFACTORY_DECL_HPP #include "MueLu_ConfigDefs.hpp" -#include "MueLu_SingleLevelFactoryBase.hpp" +#include "MueLu_TwoLevelFactoryBase.hpp" #include "MueLu_AggregateQualityEstimateFactory_fwd.hpp" #include @@ -41,8 +41,11 @@ namespace MueLu { computing, 34(2), A1079-A1109. */ -template -class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { +template +class AggregateQualityEstimateFactory : public TwoLevelFactoryBase { #undef MUELU_AGGREGATEQUALITYESTIMATEFACTORY_SHORT #include "MueLu_UseShortNames.hpp" @@ -70,7 +73,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { If the Build method of this class requires some data, but the generating factory is not specified in DeclareInput, then this class will fall back to the settings in FactoryManager. */ - void DeclareInput(Level& currentLevel) const; + void DeclareInput(Level& fineLevel, Level& coarseLevel) const; //@} @@ -78,7 +81,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { //@{ //! Build aggregate quality esimates with this factory. 
- void Build(Level& currentLevel) const; + void Build(Level& fineLevel, Level& coarseLevel) const; //@} diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp similarity index 96% rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp index c2c288192214..e7a2943d9969 100644 --- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp @@ -34,10 +34,10 @@ template AggregateQualityEstimateFactory::~AggregateQualityEstimateFactory() {} template -void AggregateQualityEstimateFactory::DeclareInput(Level& currentLevel) const { - Input(currentLevel, "A"); - Input(currentLevel, "Aggregates"); - Input(currentLevel, "CoarseMap"); +void AggregateQualityEstimateFactory::DeclareInput(Level& fineLevel, Level& coarseLevel) const { + Input(fineLevel, "A"); + Input(fineLevel, "Aggregates"); + Input(fineLevel, "CoarseMap"); } template @@ -64,13 +64,13 @@ RCP AggregateQualityEstimateFactory -void AggregateQualityEstimateFactory::Build(Level& currentLevel) const { - FactoryMonitor m(*this, "Build", currentLevel); +void AggregateQualityEstimateFactory::Build(Level& fineLevel, Level& coarseLevel) const { + FactoryMonitor m(*this, "Build", fineLevel); - RCP A = Get>(currentLevel, "A"); - RCP aggregates = Get>(currentLevel, "Aggregates"); + RCP A = Get>(fineLevel, "A"); + RCP aggregates = Get>(fineLevel, "Aggregates"); - RCP map = Get>(currentLevel, "CoarseMap"); + RCP map = Get>(fineLevel, "CoarseMap"); assert(!aggregates->AggregatesCrossProcessors()); ParameterList pL = GetParameterList(); @@ -81,15 +81,15 @@ void AggregateQualityEstimateFactory: if (mode == "eigenvalue" || mode == "both") { aggregate_qualities = Xpetra::MultiVectorFactory::Build(map, 1); ComputeAggregateQualities(A, 
aggregates, aggregate_qualities); - OutputAggQualities(currentLevel, aggregate_qualities); + OutputAggQualities(fineLevel, aggregate_qualities); } if (mode == "size" || mode == "both") { RCP aggregate_sizes = Xpetra::VectorFactory::Build(map); ComputeAggregateSizes(A, aggregates, aggregate_sizes); - Set(currentLevel, "AggregateSizes", aggregate_sizes); - OutputAggSizes(currentLevel, aggregate_sizes); + Set(fineLevel, "AggregateSizes", aggregate_sizes); + OutputAggSizes(fineLevel, aggregate_sizes); } - Set(currentLevel, "AggregateQualities", aggregate_qualities); + Set(coarseLevel, "AggregateQualities", aggregate_qualities); } template diff --git a/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml new file mode 100644 index 000000000000..f732f2a3c9b5 --- /dev/null +++ b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml index b36abd859cdd..56565e5f4de7 100644 --- a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml +++ b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml @@ -47,6 +47,7 @@ + @@ -58,15 +59,14 @@ - - - + + diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold index 3714e69e8895..5d4a2e452dab 100644 --- a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold +++ b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold @@ -26,13 +26,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a 
(secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -41,6 +37,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates @@ -71,13 +71,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -86,6 +82,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: 
check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold index ef6897802897..4c9b7d57f952 100644 --- a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold +++ b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold @@ -27,13 +27,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -42,6 +38,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates @@ -73,13 +73,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine 
level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -88,6 +84,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates diff --git a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp index 769b47c77c19..dd095e626038 100644 --- a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp +++ b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp @@ -90,26 +90,40 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, Poisson2D, Sc RCP> comm = Parameters::getDefaultComm(); - Level level; - TestHelpers::TestFactory::createSingleLevelHierarchy(level); + Level fineLevel, coarseLevel; + TestHelpers::TestFactory::createTwoLevelHierarchy(fineLevel, coarseLevel); GO nx = 20 * comm->getSize(); GO ny = nx; RCP Op = TestHelpers::TestFactory::Build2DPoisson(nx, ny); - level.Set("A", Op); + fineLevel.Set("A", Op); - AggregateQualityEstimateFactory aggQualityEstimateFactory; - std::cout << *(aggQualityEstimateFactory.GetValidParameterList()) << std::endl; - aggQualityEstimateFactory.SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false)); - aggQualityEstimateFactory.SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0)); - 
aggQualityEstimateFactory.SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false)); + RCP aggQualityEstimateFactory = rcp(new AggregateQualityEstimateFactory()); + aggQualityEstimateFactory->SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false)); + aggQualityEstimateFactory->SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0)); + aggQualityEstimateFactory->SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false)); - level.Request("AggregateQualities", &aggQualityEstimateFactory); - level.Request(aggQualityEstimateFactory); + RCP amalgFact = rcp(new AmalgamationFactory()); + RCP dropFact = rcp(new CoalesceDropFactory()); + dropFact->SetFactory("UnAmalgamationInfo", amalgFact); + RCP aggFact = rcp(new UncoupledAggregationFactory()); + aggFact->SetFactory("Graph", dropFact); + RCP coarsemapFact = Teuchos::rcp(new CoarseMapFactory()); + coarsemapFact->SetFactory("Aggregates", aggFact); + aggQualityEstimateFactory->SetFactory("Aggregates", aggFact); + aggQualityEstimateFactory->SetFactory("CoarseMap", coarsemapFact); + + coarseLevel.Request(*aggQualityEstimateFactory); + fineLevel.Request(*aggFact); + fineLevel.Request(*coarsemapFact); + + aggQualityEstimateFactory->Build(fineLevel, coarseLevel); + + coarseLevel.Request("AggregateQualities", aggQualityEstimateFactory.get()); out << "Getting aggregate qualities...\n\n"; - RCP aggQualities = level.Get>("AggregateQualities", &aggQualityEstimateFactory); + RCP aggQualities = coarseLevel.Get>("AggregateQualities", aggQualityEstimateFactory.get()); out << "Testing aggregate qualities to make sure all aggregates are of good quality...\n\n"; @@ -536,7 +550,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, ConvectionDif TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Constructor, Scalar, LO, GO, Node) \ 
TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Poisson2D, Scalar, LO, GO, Node) // TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,AnisotropicDiffusion2D,Scalar,LO,GO,Node) - // TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,ConvectionDiffusion2D,Scalar,LO,GO,Node) #include From 4e6e2ca89df63eb0050d822a60b575e2c0ddcd9c Mon Sep 17 00:00:00 2001 From: reuterb Date: Wed, 6 Nov 2024 16:40:02 -0700 Subject: [PATCH 100/243] Panzer tangent unit tests (Blocked Gather) (#13576) Refresh Gather_BlockedTpetra evaluator, put tangent capability on device, and update the unit test. --------- Signed-off-by: Bryan Reuter --- .../test/evaluator_tests/CMakeLists.txt | 7 + .../tpetra_blocked_gather_solution.cpp | 721 ++++++++++++++++++ .../Panzer_GatherSolution_BlockedTpetra.hpp | 21 +- ...nzer_GatherSolution_BlockedTpetra_impl.hpp | 178 +++-- 4 files changed, 853 insertions(+), 74 deletions(-) create mode 100644 packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt index 3b2202563a32..d871d0375cb0 100644 --- a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt +++ b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt @@ -43,6 +43,13 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( NUM_MPI_PROCS 2 ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + tGatherSolution_BlockedTpetra + SOURCES tpetra_blocked_gather_solution.cpp ${UNIT_TEST_DRIVER} + COMM serial mpi + NUM_MPI_PROCS 2 + ) + TRIBITS_ADD_EXECUTABLE_AND_TEST( tScatterResidual_Tpetra SOURCES tpetra_scatter_residual.cpp ${UNIT_TEST_DRIVER} diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp new file mode 100644 index 000000000000..279956f8d6eb --- /dev/null +++ 
b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp @@ -0,0 +1,721 @@ +// @HEADER +// ***************************************************************************** +// Panzer: A partial differential equation assembly +// engine for strongly coupled complex multiphysics systems +// +// Copyright 2011 NTESS and the Panzer contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/////////////////////////////////////////////////////////////////////////////// +// +// Include Files +// +/////////////////////////////////////////////////////////////////////////////// + +// C++ +#include +#include +#include + +// Kokkos +#include "Kokkos_View_Fad.hpp" + +// Panzer +#include "PanzerAdaptersSTK_config.hpp" +#include "Panzer_BasisIRLayout.hpp" +#include "Panzer_BlockedTpetraLinearObjFactory.hpp" +#include "Panzer_BlockedDOFManager.hpp" +#include "Panzer_DOFManager.hpp" +#include "Panzer_Evaluator_WithBaseImpl.hpp" +#include "Panzer_FieldManagerBuilder.hpp" +#include "Panzer_GatherOrientation.hpp" +#include "Panzer_PureBasis.hpp" +#include "Panzer_STKConnManager.hpp" +#include "Panzer_STK_Interface.hpp" +#include "Panzer_STK_SetupUtilities.hpp" +#include "Panzer_STK_SquareQuadMeshFactory.hpp" +#include "Panzer_STK_Version.hpp" +#include "Panzer_Workset.hpp" +#include "Panzer_LOCPair_GlobalEvaluationData.hpp" +#include "Panzer_GlobalEvaluationDataContainer.hpp" + +// Teuchos +#include "Teuchos_DefaultMpiComm.hpp" +#include "Teuchos_GlobalMPISession.hpp" +#include "Teuchos_OpaqueWrapper.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_TimeMonitor.hpp" +#include "Teuchos_UnitTestHarness.hpp" + +// Thyra +#include "Thyra_ProductVectorBase.hpp" +#include "Thyra_VectorStdOps.hpp" + +// Tpetra +#include "Tpetra_Vector.hpp" + +// user_app +#include "user_app_EquationSetFactory.hpp" + +typedef double ScalarT; +using LocalOrdinalT = panzer::LocalOrdinal; 
+using GlobalOrdinalT = panzer::GlobalOrdinal; + +typedef Tpetra::Vector VectorType; +typedef Tpetra::Operator OperatorType; +typedef Tpetra::CrsMatrix CrsMatrixType; +typedef Tpetra::CrsGraph CrsGraphType; +typedef Tpetra::Map MapType; +typedef Tpetra::Import ImportType; +typedef Tpetra::Export ExportType; + +typedef Thyra::TpetraLinearOp ThyraLinearOp; + +typedef panzer::BlockedTpetraLinearObjFactory BlockedTpetraLinObjFactoryType; +typedef panzer::TpetraLinearObjFactory TpetraLinObjFactoryType; +typedef panzer::BlockedTpetraLinearObjContainer BlockedTpetraLinObjContainerType; +typedef panzer::TpetraLinearObjContainer TpetraLinObjContainerType; + +namespace panzer +{ + + Teuchos::RCP buildBasis(std::size_t worksetSize, const std::string &basisName); + void testInitialization(const Teuchos::RCP &ipb); + Teuchos::RCP buildMesh(int elemX, int elemY); + void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success); + + // Test without tangent fields in gather evaluator + TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_no_tangents) + { + testGatherScatter(false, out, success); + } + + // Test with tangent fields in gather evaluator + TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_tangents) + { + testGatherScatter(true, out, success); + } + + // enable_tangents determines whether tangent fields dx/dp are added to gather evaluator. 
+ // These are used when computing df/dx*dx/dp with the tangent evaluation type + void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success) + { +#ifdef HAVE_MPI + Teuchos::RCP> tComm = Teuchos::rcp(new Teuchos::MpiComm(MPI_COMM_WORLD)); +#else + Teuchos::RCP> tComm = Teuchos::rcp(new Teuchos::SerialComm(MPI_COMM_WORLD)); +#endif + + int myRank = tComm->getRank(); + int numProcs = tComm->getSize(); + + const std::size_t workset_size = 4 / numProcs; + const std::string fieldName1_q1 = "U"; + const std::string fieldName2_q1 = "V"; + const std::string fieldName_qedge1 = "B"; + const int num_tangent = enable_tangents ? 5 : 0; + + Teuchos::RCP mesh = buildMesh(2, 2); + + // build input physics block + Teuchos::RCP basis_q1 = buildBasis(workset_size, "Q1"); + Teuchos::RCP basis_qedge1 = buildBasis(workset_size, "QEdge1"); + + Teuchos::RCP ipb = Teuchos::parameterList(); + testInitialization(ipb); + + const int default_int_order = 1; + std::string eBlockID = "eblock-0_0"; + Teuchos::RCP eqset_factory = Teuchos::rcp(new user_app::MyFactory); + panzer::CellData cellData(workset_size, mesh->getCellTopology("eblock-0_0")); + Teuchos::RCP gd = panzer::createGlobalData(); + Teuchos::RCP physicsBlock = + Teuchos::rcp(new PhysicsBlock(ipb, eBlockID, default_int_order, cellData, eqset_factory, gd, false)); + + Teuchos::RCP> work_sets = panzer_stk::buildWorksets(*mesh, physicsBlock->elementBlockID(), + physicsBlock->getWorksetNeeds()); + TEST_EQUALITY(work_sets->size(), 1); + + // build connection manager and field manager + const Teuchos::RCP conn_manager = Teuchos::rcp(new panzer_stk::STKConnManager(mesh)); + Teuchos::RCP blocked_dofManager = Teuchos::rcp(new panzer::BlockedDOFManager(conn_manager, MPI_COMM_WORLD)); + + blocked_dofManager->addField(fieldName1_q1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + blocked_dofManager->addField(fieldName2_q1, Teuchos::rcp(new 
panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + blocked_dofManager->addField(fieldName_qedge1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis()))); + + std::vector > fieldOrder(3); + fieldOrder[0].push_back(fieldName1_q1); + fieldOrder[1].push_back(fieldName_qedge1); + fieldOrder[2].push_back(fieldName2_q1); + blocked_dofManager->setFieldOrder(fieldOrder); + + blocked_dofManager->buildGlobalUnknowns(); + + // setup linear object factory + ///////////////////////////////////////////////////////////// + + Teuchos::RCP t_lof = Teuchos::rcp(new BlockedTpetraLinObjFactoryType(tComm.getConst(), blocked_dofManager)); + Teuchos::RCP> lof = t_lof; + Teuchos::RCP loc = t_lof->buildGhostedLinearObjContainer(); + t_lof->initializeGhostedContainer(LinearObjContainer::X, *loc); + loc->initialize(); + + Teuchos::RCP t_loc = Teuchos::rcp_dynamic_cast(loc); + Teuchos::RCP> x_vec = t_loc->get_x_th(); + Thyra::assign(x_vec.ptr(), 123.0 + myRank); + + // need a place to evaluate the tangent fields, so we create a + // unblocked DOFManager and LOF and set up if needed + std::vector> tangentContainers; + Teuchos::RCP dofManager = Teuchos::rcp(new panzer::DOFManager(conn_manager, MPI_COMM_WORLD)); + Teuchos::RCP tangent_lof = Teuchos::rcp(new TpetraLinObjFactoryType(tComm.getConst(), dofManager)); + Teuchos::RCP> parent_tangent_lof = tangent_lof; + + if (enable_tangents) + { + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcp_dynamic_cast; + using Thyra::ProductVectorBase; + using LOCPair = panzer::LOCPair_GlobalEvaluationData; + + std::vector tangent_fieldOrder; + for (int i(0); i < num_tangent; ++i) { + std::stringstream ssedge; + ssedge << fieldName_qedge1 << " Tangent " << i; + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + + dofManager->addField(ss1.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + 
dofManager->addField(ss2.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + dofManager->addField(ssedge.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis()))); + tangent_fieldOrder.push_back(ss1.str()); + tangent_fieldOrder.push_back(ss2.str()); + tangent_fieldOrder.push_back(ssedge.str()); + } + dofManager->setFieldOrder(tangent_fieldOrder); + dofManager->buildGlobalUnknowns(); + + // generate and evaluate some fields + Teuchos::RCP tangent_loc = tangent_lof->buildGhostedLinearObjContainer(); + tangent_lof->initializeGhostedContainer(LinearObjContainer::X, *tangent_loc); + tangent_loc->initialize(); + + for (int i(0); i < num_tangent; ++i) + { + auto locPair = Teuchos::rcp(new LOCPair(tangent_lof, panzer::LinearObjContainer::X)); + + auto global_t_loc = rcp_dynamic_cast(locPair->getGlobalLOC()); + Teuchos::RCP> global_x_vec = global_t_loc->get_x_th(); + Thyra::assign(global_x_vec.ptr(), 0.123 + myRank + i); + + auto ghosted_t_loc = rcp_dynamic_cast(locPair->getGhostedLOC()); + Teuchos::RCP> ghosted_x_vec = ghosted_t_loc->get_x_th(); + Thyra::assign(ghosted_x_vec.ptr(), 0.123 + myRank + i); + + tangentContainers.push_back(locPair); + } // end loop over the tangents + } // end if (enable_tangents) + + // setup field manager, add evaluator under test + ///////////////////////////////////////////////////////////// + + PHX::FieldManager fm; + + std::vector derivative_dimensions; + derivative_dimensions.push_back(12); + fm.setKokkosExtendedDataTypeDimensions(derivative_dimensions); + + std::vector tan_derivative_dimensions; + if (enable_tangents) + tan_derivative_dimensions.push_back(num_tangent); + else + tan_derivative_dimensions.push_back(0); + fm.setKokkosExtendedDataTypeDimensions(tan_derivative_dimensions); + + Teuchos::RCP evalField_q1, evalField_qedge1; + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + 
names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", 
names); + pl.set("Indexer Names", names); + + if (enable_tangents) + { + RCP>> tangent_names = + rcp(new std::vector>(2)); + for (int i = 0; i < num_tangent; ++i) + { + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + (*tangent_names)[0].push_back(ss1.str()); + (*tangent_names)[1].push_back(ss2.str()); + } + pl.set("Tangent Names", tangent_names); + } + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + if (enable_tangents) + { + RCP>> tangent_names = + rcp(new std::vector>(1)); + for (int i = 0; i < num_tangent; ++i) + { + std::stringstream ss; + ss << fieldName_qedge1 << " Tangent " << i; + (*tangent_names)[0].push_back(ss.str()); + } + pl.set("Tangent Names", tangent_names); + } + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + if (enable_tangents) + { + for (int i = 0; i < num_tangent; ++i) + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + RCP> tangent_names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + { + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + tangent_names->push_back(ss1.str()); + tangent_names->push_back(ss2.str()); + } + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", tangent_names); + pl.set("Indexer Names", tangent_names); + + { + 
std::stringstream ss; + ss << "Tangent Container " << i; + pl.set("Global Data Key", ss.str()); + } + + Teuchos::RCP> evaluator = + parent_tangent_lof->buildGatherTangent(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + } + for (int i = 0; i < num_tangent; ++i) + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + RCP> tangent_names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + { + std::stringstream ss; + ss << fieldName_qedge1 << " Tangent " << i; + tangent_names->push_back(ss.str()); + } + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", tangent_names); + pl.set("Indexer Names", tangent_names); + + { + std::stringstream ss; + ss << "Tangent Container " << i; + pl.set("Global Data Key", ss.str()); + } + + Teuchos::RCP> evaluator = + parent_tangent_lof->buildGatherTangent(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + } + } + + panzer::Traits::SD sd; + + panzer::Workset &workset = (*work_sets)[0]; + workset.alpha = 0.0; + workset.beta = 2.0; // derivatives multiplied by 2 + workset.time = 0.0; + workset.evaluate_transient_terms = false; + + sd.worksets_ = work_sets; + + fm.postRegistrationSetup(sd); + + panzer::Traits::PED ped; + ped.gedc->addDataObject("Solution Gather Container", loc); + if (enable_tangents) + { + for (int i(0); i < num_tangent; ++i) + { + std::stringstream ss; + ss << "Tangent Container " << i; + ped.gedc->addDataObject(ss.str(), tangentContainers[i]); + } + } + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + // test Residual fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, 
basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + TEST_EQUALITY(fieldData1_q1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData1_q1.extent(1), 4); + TEST_EQUALITY(fieldData2_q1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData2_q1.extent(1), 4); + TEST_EQUALITY(fieldData1_q1.size(), Teuchos::as(4 * 4 / numProcs)); + TEST_EQUALITY(fieldData2_q1.size(), Teuchos::as(4 * 4 / numProcs)); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int i = 0; i < fieldData1_q1.extent(0); i++) + for (unsigned int j = 0; j < fieldData1_q1.extent(1); j++) + TEST_EQUALITY(fieldData1_q1_h(i, j), 123.0 + myRank); + + for (unsigned int i = 0; i < fieldData2_q1.extent(0); i++) + for (unsigned int j = 0; j < fieldData2_q1.extent(1); j++) + TEST_EQUALITY(fieldData2_q1_h(i, j), 123.0 + myRank); + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + TEST_EQUALITY(fieldData_qedge1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData_qedge1.extent(1), 4); + TEST_EQUALITY(fieldData_qedge1.size(), Teuchos::as(4 * 4 / numProcs)); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); pt++) + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + } + + // test Jacobian fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, 
basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++) + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 12); + } + } + for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 12); + } + } + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt) + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 12); + } + } + } + + // test Tangent fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + 
Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < num_tangent; ++i) + TEST_EQUALITY(fieldData1_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + else + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 0); + } + } + } + for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < num_tangent; ++i) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + } + else + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 0); + } + } + } + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < 
num_tangent; ++i) + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + else + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 0); + } + } + } + } + } + + Teuchos::RCP buildBasis(std::size_t worksetSize, const std::string &basisName) + { + Teuchos::RCP topo = + Teuchos::rcp(new shards::CellTopology(shards::getCellTopologyData>())); + + panzer::CellData cellData(worksetSize, topo); + return Teuchos::rcp(new panzer::PureBasis(basisName, 1, cellData)); + } + + Teuchos::RCP buildMesh(int elemX, int elemY) + { + Teuchos::RCP pl = rcp(new Teuchos::ParameterList); + pl->set("X Blocks", 1); + pl->set("Y Blocks", 1); + pl->set("X Elements", elemX); + pl->set("Y Elements", elemY); + + panzer_stk::SquareQuadMeshFactory factory; + factory.setParameterList(pl); + Teuchos::RCP mesh = factory.buildUncommitedMesh(MPI_COMM_WORLD); + factory.completeMeshConstruction(*mesh, MPI_COMM_WORLD); + + return mesh; + } + + void testInitialization(const Teuchos::RCP &ipb) + { + // Physics block + ipb->setName("test physics"); + { + Teuchos::ParameterList &p = ipb->sublist("a"); + p.set("Type", "Energy"); + p.set("Prefix", ""); + p.set("Model ID", "solid"); + p.set("Basis Type", "HGrad"); + p.set("Basis Order", 1); + p.set("Integration Order", 1); + } + { + Teuchos::ParameterList &p = ipb->sublist("b"); + p.set("Type", "Energy"); + p.set("Prefix", "ION_"); + p.set("Model ID", "solid"); + p.set("Basis Type", "HCurl"); + p.set("Basis Order", 1); + p.set("Integration Order", 1); + } + } + +} diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp index aec43b41dfbc..6d9bde9d1a3b 100644 --- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp +++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp @@ -163,7 +163,7 @@ class 
GatherSolution_BlockedTpetra public: GatherSolution_BlockedTpetra(const Teuchos::RCP & indexer) - : gidIndexer_(indexer) {} + : globalIndexer_(indexer) {} GatherSolution_BlockedTpetra(const Teuchos::RCP & indexer, const Teuchos::ParameterList& p); @@ -176,13 +176,13 @@ class GatherSolution_BlockedTpetra void evaluateFields(typename TRAITS::EvalData d); virtual Teuchos::RCP clone(const Teuchos::ParameterList & pl) const - { return Teuchos::rcp(new GatherSolution_BlockedTpetra(gidIndexer_,pl)); } + { return Teuchos::rcp(new GatherSolution_BlockedTpetra(globalIndexer_,pl)); } private: typedef typename panzer::Traits::Tangent EvalT; typedef typename panzer::Traits::Tangent::ScalarT ScalarT; - //typedef typename panzer::Traits::RealType RealT; + typedef typename panzer::Traits::RealType RealT; typedef BlockedTpetraLinearObjContainer ContainerType; typedef Tpetra::Vector VectorType; @@ -194,10 +194,14 @@ class GatherSolution_BlockedTpetra // maps the local (field,element,basis) triplet to a global ID // for scattering - Teuchos::RCP gidIndexer_; + Teuchos::RCP globalIndexer_; std::vector fieldIds_; // field IDs needing mapping + //! Returns the index into the Thyra ProductVector sub-block. Size + //! of number of fields to scatter + std::vector productVectorBlockIndex_; + std::vector< PHX::MDField > gatherFields_; std::vector indexerNames_; @@ -206,9 +210,16 @@ class GatherSolution_BlockedTpetra Teuchos::RCP > blockedContainer_; + //! Local indices for unknowns + PHX::View worksetLIDs_; + + //! Offset into the cell lids for each field. Size of number of fields to scatter. 
+ std::vector> fieldOffsets_; + // Fields for storing tangent components dx/dp of solution vector x bool has_tangent_fields_; - std::vector< std::vector< PHX::MDField > > tangentFields_; + std::vector< std::vector< PHX::MDField > > tangentFields_; + PHX::ViewOfViews<2,PHX::View> tangentFieldsVoV_; GatherSolution_BlockedTpetra(); }; diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp index b0ef54fdd70b..52488585d37e 100644 --- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp +++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp @@ -8,8 +8,8 @@ // ***************************************************************************** // @HEADER -#ifndef PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP -#define PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP +#ifndef PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP +#define PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP #include "Teuchos_Assert.hpp" #include "Phalanx_DataLayout.hpp" @@ -216,7 +216,7 @@ panzer::GatherSolution_BlockedTpetra & indexer, const Teuchos::ParameterList& p) - : gidIndexer_(indexer) + : globalIndexer_(indexer) , has_tangent_fields_(false) { typedef std::vector< std::vector > vvstring; @@ -250,7 +250,7 @@ GatherSolution_BlockedTpetra( tangentFields_[fd].resize(tangent_field_names[fd].size()); for (std::size_t i=0; i(tangent_field_names[fd][i],basis->functional); + PHX::MDField(tangent_field_names[fd][i],basis->functional); this->addDependentField(tangentFields_[fd][i]); } } @@ -268,17 +268,60 @@ GatherSolution_BlockedTpetra( // ********************************************************************** template void panzer::GatherSolution_BlockedTpetra:: -postRegistrationSetup(typename TRAITS::SetupData /* d */, +postRegistrationSetup(typename TRAITS::SetupData d, PHX::FieldManager& /* fm */) { 
TEUCHOS_ASSERT(gatherFields_.size() == indexerNames_.size()); - fieldIds_.resize(gatherFields_.size()); + const Workset & workset_0 = (*d.worksets_)[0]; + const std::string blockId = this->wda(workset_0).block_id; + fieldIds_.resize(gatherFields_.size()); + fieldOffsets_.resize(gatherFields_.size()); + productVectorBlockIndex_.resize(gatherFields_.size()); + int maxElementBlockGIDCount = -1; for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) { - // get field ID from DOF manager - const std::string& fieldName = indexerNames_[fd]; - fieldIds_[fd] = gidIndexer_->getFieldNum(fieldName); + + const std::string fieldName = indexerNames_[fd]; + const int globalFieldNum = globalIndexer_->getFieldNum(fieldName); // Field number in the aggregate BlockDOFManager + productVectorBlockIndex_[fd] = globalIndexer_->getFieldBlock(globalFieldNum); + const auto& subGlobalIndexer = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fd]]; + fieldIds_[fd] = subGlobalIndexer->getFieldNum(fieldName); // Field number in the sub-global-indexer + + const std::vector& offsets = subGlobalIndexer->getGIDFieldOffsets(blockId,fieldIds_[fd]); + fieldOffsets_[fd] = PHX::View("GatherSolution_BlockedTpetra(Tangent):fieldOffsets",offsets.size()); + auto hostOffsets = Kokkos::create_mirror_view(fieldOffsets_[fd]); + for (std::size_t i=0; i < offsets.size(); ++i) + hostOffsets(i) = offsets[i]; + Kokkos::deep_copy(fieldOffsets_[fd], hostOffsets); + maxElementBlockGIDCount = std::max(subGlobalIndexer->getElementBlockGIDCount(blockId),maxElementBlockGIDCount); + } + + // We will use one workset lid view for all fields, but has to be + // sized big enough to hold the largest elementBlockGIDCount in the + // ProductVector. 
+ worksetLIDs_ = PHX::View("ScatterResidual_BlockedTpetra(Tangent):worksetLIDs", + gatherFields_[0].extent(0), + maxElementBlockGIDCount); + + // Set up storage for tangentFields using view of views + // We also need storage for the number of tangent fields associated with + // each gatherField + + if (has_tangent_fields_) { + + size_t inner_vector_max_size = 0; + for (std::size_t fd = 0; fd < tangentFields_.size(); ++fd) + inner_vector_max_size = std::max(inner_vector_max_size,tangentFields_[fd].size()); + tangentFieldsVoV_.initialize("GatherSolution_BlockedTpetra::tangentFieldsVoV_",gatherFields_.size(),inner_vector_max_size); + + for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) { + for (std::size_t i=0; i void panzer::GatherSolution_BlockedTpetra:: evaluateFields(typename TRAITS::EvalData workset) { - using Teuchos::RCP; - using Teuchos::ArrayRCP; - using Teuchos::ptrFromRef; - using Teuchos::rcp_dynamic_cast; - - using Thyra::VectorBase; - using Thyra::SpmdVectorBase; - using Thyra::ProductVectorBase; + using Teuchos::RCP; + using Teuchos::ArrayRCP; + using Teuchos::ptrFromRef; + using Teuchos::rcp_dynamic_cast; - Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout)); - out.setShowProcRank(true); - out.setOutputToRootOnly(-1); + using Thyra::VectorBase; + using Thyra::SpmdVectorBase; + using Thyra::ProductVectorBase; - std::vector > GIDs; - std::vector LIDs; + Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout)); + out.setShowProcRank(true); + out.setOutputToRootOnly(-1); - // for convenience pull out some objects from workset - std::string blockId = this->wda(workset).block_id; - const std::vector & localCellIds = this->wda(workset).cell_local_ids; + const PHX::View & localCellIds = this->wda(workset).getLocalCellIDs(); - Teuchos::RCP > x; - if (useTimeDerivativeSolutionVector_) - x = rcp_dynamic_cast >(blockedContainer_->get_dxdt()); - else - x = rcp_dynamic_cast >(blockedContainer_->get_x()); + Teuchos::RCP > blockedSolution; + if 
(useTimeDerivativeSolutionVector_) + blockedSolution = rcp_dynamic_cast >(blockedContainer_->get_dxdt()); + else + blockedSolution = rcp_dynamic_cast >(blockedContainer_->get_x()); - // gather operation for each cell in workset - for(std::size_t worksetCellIndex=0;worksetCellIndexgetFieldDOFManagers()[productVectorBlockIndex_[fieldIndex]]; + const std::string blockId = this->wda(workset).block_id; + const int num_dofs = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fieldIndex]]->getElementBlockGIDCount(blockId); + blockIndexer->getElementLIDs(localCellIds,worksetLIDs_,num_dofs); + currentWorksetLIDSubBlock = productVectorBlockIndex_[fieldIndex]; + } - gidIndexer_->getElementGIDsPair(cellLocalId,GIDs,blockId); + const int blockRowIndex = productVectorBlockIndex_[fieldIndex]; + const auto& subblockSolution = *((rcp_dynamic_cast>(blockedSolution->getNonconstVectorBlock(blockRowIndex),true))->getTpetraVector()); + const auto kokkosSolution = subblockSolution.getLocalViewDevice(Tpetra::Access::ReadOnly); - // caculate the local IDs for this element - LIDs.resize(GIDs.size()); - for(std::size_t i=0;i x_map = blockedContainer_->getMapForBlock(GIDs[i].first); + // Class data fields for lambda capture + const PHX::View fieldOffsets = fieldOffsets_[fieldIndex]; + const PHX::View worksetLIDs = worksetLIDs_; + const PHX::View fieldValues = gatherFields_[fieldIndex].get_static_view(); - LIDs[i] = x_map->getLocalElement(GIDs[i].second); - } + if (has_tangent_fields_) { + const int numTangents = tangentFields_[fieldIndex].size(); + const auto tangentFieldsDevice = tangentFieldsVoV_.getViewDevice(); + const auto kokkosTangents = Kokkos::subview(tangentFieldsDevice,fieldIndex,Kokkos::ALL()); + Kokkos::parallel_for(Kokkos::RangePolicy(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) { + for (int basis=0; basis < static_cast(fieldOffsets.size()); ++basis) { + const int rowLID = worksetLIDs(cell,fieldOffsets(basis)); + fieldValues(cell,basis).zero(); + 
fieldValues(cell,basis).val() = kokkosSolution(rowLID,0); + for (int i_tangent=0; i_tangent(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) { + for (int basis=0; basis < static_cast(fieldOffsets.size()); ++basis) { + const int rowLID = worksetLIDs(cell,fieldOffsets(basis)); + fieldValues(cell,basis).zero(); + fieldValues(cell,basis) = kokkosSolution(rowLID,0); + } + }); + } + } - // loop over the fields to be gathered - Teuchos::ArrayRCP local_x; - for (std::size_t fieldIndex=0; fieldIndexgetFieldBlock(fieldNum); - - // grab local data for inputing - RCP > block_x = rcp_dynamic_cast >(x->getNonconstVectorBlock(indexerId)); - block_x->getLocalData(ptrFromRef(local_x)); - - const std::vector & elmtOffset = gidIndexer_->getGIDFieldOffsets(blockId,fieldNum); - - // loop over basis functions and fill the fields - for(std::size_t basis=0;basis Date: Tue, 5 Nov 2024 14:42:32 -0700 Subject: [PATCH 101/243] Tell aggregate export about Graph and Aggregates Signed-off-by: maxfirmbach --- .../MueLu_ParameterListInterpreter_def.hpp | 2 ++ .../MueLu_AggregationExportFactory_decl.hpp | 2 +- .../MueLu_AggregationExportFactory_def.hpp | 28 ++++++++++--------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp index 207791bf5b5b..cc39c32e146d 100644 --- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp +++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp @@ -1332,6 +1332,8 @@ void ParameterListInterpreter:: MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: output file: build colormap", bool, aggExportParams); aggExport->SetParameterList(aggExportParams); aggExport->SetFactory("DofsPerNode", manager.GetFactory("DofsPerNode")); + aggExport->SetFactory("Aggregates", manager.GetFactory("Aggregates")); + aggExport->SetFactory("Graph", manager.GetFactory("Graph")); if 
(!RAP.is_null()) RAP->AddTransferFactory(aggExport); diff --git a/packages/muelu/src/Utils/MueLu_AggregationExportFactory_decl.hpp b/packages/muelu/src/Utils/MueLu_AggregationExportFactory_decl.hpp index 58d2936e32ed..e5cbcb5693e9 100644 --- a/packages/muelu/src/Utils/MueLu_AggregationExportFactory_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregationExportFactory_decl.hpp @@ -123,7 +123,7 @@ class AggregationExportFactory : public TwoLevelFactoryBase, public Visualizatio // Data that the different styles need to have available when building geometry mutable Teuchos::RCP coords_; // fine local coordinates mutable Teuchos::RCP coordsCoarse_; // coarse local coordinates - mutable Teuchos::ArrayRCP vertex2AggId_; + mutable Teuchos::RCP vertex2AggId_; mutable Teuchos::ArrayRCP aggSizes_; mutable std::vector isRoot_; mutable bool doFineGraphEdges_; diff --git a/packages/muelu/src/Utils/MueLu_AggregationExportFactory_def.hpp b/packages/muelu/src/Utils/MueLu_AggregationExportFactory_def.hpp index 823349a58b34..2e2ae7ba9acc 100644 --- a/packages/muelu/src/Utils/MueLu_AggregationExportFactory_def.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregationExportFactory_def.hpp @@ -167,12 +167,9 @@ void AggregationExportFactory::Build( } } GetOStream(Runtime0) << "AggregationExportFactory: DofsPerNode: " << DofsPerNode << std::endl; - Teuchos::RCP vertex2AggId_vector = aggregates->GetVertex2AggId(); - Teuchos::RCP procWinner_vector = aggregates->GetProcWinner(); - Teuchos::ArrayRCP vertex2AggId = aggregates->GetVertex2AggId()->getDataNonConst(0); - Teuchos::ArrayRCP procWinner = aggregates->GetProcWinner()->getDataNonConst(0); - vertex2AggId_ = vertex2AggId; + Teuchos::RCP vertex2AggId = aggregates->GetVertex2AggId(); + vertex2AggId_ = vertex2AggId; // prepare for calculating global aggregate ids std::vector numAggsGlobal(numProcs, 0); @@ -263,9 +260,10 @@ void AggregationExportFactory::Build( } if (aggStyle == "Point Cloud") this->doPointCloud(vertices, geomSizes, numAggs_, 
numNodes_); - else if (aggStyle == "Jacks") - this->doJacks(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggId_); - else if (aggStyle == "Jacks++") // Not actually implemented + else if (aggStyle == "Jacks") { + auto vertex2AggIds = vertex2AggId_->getDataNonConst(0); + this->doJacks(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggIds); + } else if (aggStyle == "Jacks++") // Not actually implemented doJacksPlus_(vertices, geomSizes); else if (aggStyle == "Convex Hulls") doConvexHulls(vertices, geomSizes); @@ -305,11 +303,13 @@ void AggregationExportFactory::doConv Teuchos::ArrayRCP::coordinateType> yCoords = coords_->getData(1); Teuchos::ArrayRCP::coordinateType> zCoords = Teuchos::null; + auto vertex2AggIds = vertex2AggId_->getDataNonConst(0); + if (dims_ == 2) { - this->doConvexHulls2D(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggId_, xCoords, yCoords); + this->doConvexHulls2D(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggIds, xCoords, yCoords); } else { zCoords = coords_->getData(2); - this->doConvexHulls3D(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggId_, xCoords, yCoords, zCoords); + this->doConvexHulls3D(vertices, geomSizes, numAggs_, numNodes_, isRoot_, vertex2AggIds, xCoords, yCoords, zCoords); } } @@ -571,6 +571,8 @@ void AggregationExportFactory::writeF if (dims_ == 3) zCoords = coords_->getData(2); + auto vertex2AggIds = vertex2AggId_->getDataNonConst(0); + vector uniqueFine = this->makeUnique(vertices); string indent = " "; fout << "" << endl; @@ -596,10 +598,10 @@ void AggregationExportFactory::writeF fout << " " << endl; fout << indent; for (size_t i = 0; i < uniqueFine.size(); i++) { - if (vertex2AggId_[uniqueFine[i]] == -1) - fout << vertex2AggId_[uniqueFine[i]] << " "; + if (vertex2AggIds[uniqueFine[i]] == -1) + fout << vertex2AggIds[uniqueFine[i]] << " "; else - fout << aggsOffset_ + vertex2AggId_[uniqueFine[i]] << " "; + fout << aggsOffset_ + vertex2AggIds[uniqueFine[i]] << " 
"; if (i % 10 == 9) fout << endl << indent; From 6f090bc8e524d35fad010f682d4badb9c9918c11 Mon Sep 17 00:00:00 2001 From: "Curtis C. Ober" Date: Thu, 7 Nov 2024 13:07:31 -0700 Subject: [PATCH 102/243] Suggested changes from PR and Removal of Operator. Not transitioning the operator functionality. Signed-off-by: Curtis C. Ober --- .../core/src/Tpetra_LinearProblem_decl.hpp | 45 ++++-------- .../core/src/Tpetra_LinearProblem_def.hpp | 27 +------ .../LinearProblem/LinearProblem_UnitTests.cpp | 71 +++++++++++-------- 3 files changed, 54 insertions(+), 89 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp index bdea3fc35626..0b951f99124f 100644 --- a/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_decl.hpp @@ -61,7 +61,6 @@ namespace Tpetra { using row_matrix_type = RowMatrix; using multivector_type = MultiVector; using vector_type = Vector; - using operator_type = Operator; using linear_problem_type = LinearProblem; //@} @@ -71,28 +70,19 @@ namespace Tpetra { /// \brief Default Constructor. /// - /// Creates an empty LinearProblem instance. The operator + /// Creates an empty LinearProblem instance. The matrix /// A, left-hand-side X and right-hand-side B must be set - /// use the setOperator(), SetLHS() and SetRHS() methods + /// use the setMatrix(), SetLHS() and SetRHS() methods /// respectively. LinearProblem(); - /// \brief Constructor with a matrix as the operator. + /// \brief Constructor with a matrix. /// - /// Creates a LinearProblem instance where the operator - /// is passed in as a matrix. + /// Creates a LinearProblem instance with a matrix. LinearProblem(const Teuchos::RCP & A, const Teuchos::RCP& X, const Teuchos::RCP& B); - /// \brief Constructor with Operator. - /// - /// Creates a LinearProblem instance for the case where - /// an operator is not necessarily a matrix. 
- LinearProblem(const Teuchos::RCP & A, - const Teuchos::RCP& X, - const Teuchos::RCP& B); - //! Copy Constructor. LinearProblem(const LinearProblem& Problem); @@ -127,17 +117,11 @@ namespace Tpetra { //! @name Set methods //@{ - /// \brief Set Operator A of linear problem AX = B using a RowMatrix. + /// \brief Set Matrix A of linear problem AX = B using a RowMatrix. /// /// Sets an RCP to a RowMatrix. No copy of the operator is made. - void setOperator(Teuchos::RCP A) - { A_ = A; Operator_ = A; } - - /// \brief Set Operator A of linear problem AX = B using an Operator. - /// - /// Sets an RCP to an Operator. No copy of the operator is made. - void setOperator(Teuchos::RCP A) - { A_ = Teuchos::rcp_dynamic_cast(A); Operator_ = A; } + void setMatrix(Teuchos::RCP A) + { A_ = A; } /// \brief Set left-hand-side X of linear problem AX = B. /// @@ -157,8 +141,7 @@ namespace Tpetra { /// \brief Perform left scaling of a linear problem. /// /// Applies the scaling vector D to the left side of the - /// matrix A() and to the right hand side B(). Note that - /// the operator must be a RowMatrix, not just an Operator. + /// matrix A() and to the right hand side B(). /// /// \param In /// D - Vector containing scaling values. D[i] will @@ -173,8 +156,7 @@ namespace Tpetra { /// /// Applies the scaling vector D to the right side of the /// matrix A(). Apply the inverse of D to the initial - /// guess. Note that the operator must be a RowMatrix, - /// not just an Operator. + /// guess. /// /// \param In /// D - Vector containing scaling values. D[i] will @@ -191,20 +173,17 @@ namespace Tpetra { //! @name Accessor methods //@{ - //! Get an RCP to the operator A. - Teuchos::RCP getOperator() const {return(Operator_);}; //! Get an RCP to the matrix A. - Teuchos::RCP getMatrix() const {return(A_);}; + Teuchos::RCP getMatrix() const {return(A_);} //! Get an RCP to the left-hand-side X. - Teuchos::RCP getLHS() const {return(X_);}; + Teuchos::RCP getLHS() const {return(X_);} //! 
Get an RCP to the right-hand-side B. - Teuchos::RCP getRHS() const {return(B_);}; + Teuchos::RCP getRHS() const {return(B_);} //@} private: - Teuchos::RCP Operator_; Teuchos::RCP A_; Teuchos::RCP X_; Teuchos::RCP B_; diff --git a/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp index 78612e75e344..ff0134852b74 100644 --- a/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp +++ b/packages/tpetra/core/src/Tpetra_LinearProblem_def.hpp @@ -30,7 +30,6 @@ namespace Tpetra { LinearProblem:: LinearProblem () : dist_object_type (Teuchos::rcp (new map_type ())), - Operator_(Teuchos::null), A_(Teuchos::null), X_(Teuchos::null), B_(Teuchos::null) @@ -43,35 +42,16 @@ namespace Tpetra { const Teuchos::RCP& X, const Teuchos::RCP& B) : dist_object_type (A->getDomainMap()), - Operator_(Teuchos::null), A_(A), X_(X), B_(B) { - // Try to make matrix an operator - Operator_ = Teuchos::rcp_dynamic_cast(A_); - } - - template - LinearProblem:: - LinearProblem (const Teuchos::RCP & A, - const Teuchos::RCP& X, - const Teuchos::RCP& B) - : dist_object_type (*X), - Operator_(A), - A_(Teuchos::null), - X_(X), - B_(B) - { - // Try to make operator a matrix - A_ = Teuchos::rcp_dynamic_cast(Operator_); } template LinearProblem:: LinearProblem (const LinearProblem& Problem) : dist_object_type (Problem), - Operator_(Problem.Operator_), A_(Problem.A_), X_(Problem.X_), B_(Problem.B_) @@ -123,11 +103,8 @@ namespace Tpetra { const char tfecfFuncName[] = "checkInput: "; - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(Operator_==Teuchos::null, - std::logic_error, "Operator_ is unset."); - - TPETRA_ABUSE_WARNING(A_==Teuchos::null, std::runtime_error, - "Linear problem does not have a matrix (A_), just an operator."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(A_==Teuchos::null, std::runtime_error, + "Linear problem does not have a matrix (A_)."); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_==Teuchos::null, std::logic_error, "Solution vector (X_) is 
unset."); diff --git a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp index d56480dcc2e0..cdca5cccc8d1 100644 --- a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp +++ b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp @@ -9,9 +9,10 @@ #include "Tpetra_TestingUtilities.hpp" #include "Tpetra_LinearProblem.hpp" - #include "Tpetra_CrsMatrix.hpp" +#include "Teuchos_ScalarTraits.hpp" + namespace { // (anonymous) @@ -29,6 +30,8 @@ namespace { // (anonymous) using GST = Tpetra::global_size_t; +#define DEBUG_TEST +#ifdef DEBUG_TEST /// \brief Print out pretty version of RowMatrix. template void Display_CrsMatrix (std::string label, RCP > A, RCP< const Comm< int > > comm, Teuchos::FancyOStream& myOut) @@ -77,12 +80,12 @@ namespace { // (anonymous) // Convert local index to global index GST globalIndex = A->getColMap()->getGlobalElement(localIndices(k)); if (globalIndex == j) { - myOut << std::setw(8) << values(k) << " "; + myOut << std::setw(3) << values(k) << " "; break; } } } else { - myOut << std::setw(8) << 0 << " "; + myOut << std::setw(3) << 0 << " "; } } myOut << "]" << endl; @@ -126,6 +129,7 @@ namespace { // (anonymous) auto multivector = Teuchos::rcp_dynamic_cast> (vector); Display_MultiVector(label, multivector, comm, myOut); } +#endif // @@ -162,7 +166,7 @@ namespace { // (anonymous) //const size_t numImages = comm->getSize(); const size_t myImageID = comm->getRank(); // create a Map - const size_t numLocal = 10; + const size_t numLocal = 5; const size_t numVecs = 1; RCP map = createContigMapWithNode(INVALID,numLocal,comm); GO base = numLocal*myImageID; @@ -171,7 +175,6 @@ namespace { // (anonymous) { RCP A_crs = rcp(new MAT(map,3)); for (size_t i=0; iinsertGlobalValues(base+i,tuple(base+i),tuple(ST::one())); A_crs->insertGlobalValues(base + i, tuple(base + i), tuple(2.0)); // Diagonal entry GST globalIndex = base + i; @@ -208,31 
+211,33 @@ namespace { // (anonymous) RCP linearProblem = rcp(new LPT()); - linearProblem->setOperator(A); + linearProblem->setMatrix(A); linearProblem->setLHS(X); linearProblem->setRHS(B); linearProblem->checkInput(); - //if (myImageID==0) myOut << "Original LinearProblem" << endl; - //Display_CrsMatrix("A", A, comm, myOut); - //Display_MultiVector("Solution Vector", X, comm, myOut); - //Display_MultiVector("RHS Vector", B, comm, myOut); - //Display_Vector("Scaling Vector", S, comm, myOut); +#ifdef DEBUG_TEST + if (myImageID==0) myOut << "Original LinearProblem" << endl; + Display_CrsMatrix("A", A, comm, myOut); + Display_MultiVector("Solution Vector", X, comm, myOut); + Display_MultiVector("RHS Vector", B, comm, myOut); + Display_Vector("Scaling Vector", S, comm, myOut); +#endif + mag_type eps = Teuchos::as(100)*Teuchos::ScalarTraits::eps(); // Original LinearProblem GST N = globalNumElements; double normF = std::sqrt(6*N - 2); TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), - Teuchos::as(normF), Teuchos::as(1.0e-14)); - //Teuchos::as(7.615773105863909), Teuchos::as(1.0e-14)); + Teuchos::as(normF), eps); Array norms(numVecs); linearProblem->getLHS()->norm1(norms()); size_t vector_sum = N*(N+1)/2; - TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), eps); linearProblem->getRHS()->norm1(norms()); - TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), eps); // Left Scaling linearProblem->leftScale(S); @@ -240,17 +245,19 @@ namespace { // (anonymous) size_t vector_sum_squared = N*(N+1)*(2*N+1)/6; normF = std::sqrt(6*vector_sum_squared - N*N - 1); TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), - Teuchos::as(normF), Teuchos::as(1.0e-14)); + Teuchos::as(normF), eps); linearProblem->getLHS()->norm1(norms()); - TEST_FLOATING_EQUALITY(norms[0], 
Teuchos::as(vector_sum), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), eps); linearProblem->getRHS()->norm1(norms()); - TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), eps); - //if (myImageID==0) myOut << "After Left Scaling" << endl; - //Display_CrsMatrix("A", A, comm, myOut); - //Display_MultiVector("Solution Vector", X, comm, myOut); - //Display_MultiVector("RHS Vector", B, comm, myOut); - //Display_Vector("Scaling Vector", S, comm, myOut); +#ifdef DEBUG_TEST + if (myImageID==0) myOut << "After Left Scaling" << endl; + Display_CrsMatrix("A", A, comm, myOut); + Display_MultiVector("Solution Vector", X, comm, myOut); + Display_MultiVector("RHS Vector", B, comm, myOut); + Display_Vector("Scaling Vector", S, comm, myOut); +#endif // Right Scaling linearProblem->rightScale(S); @@ -263,17 +270,19 @@ namespace { // (anonymous) size_t diag = (2.0 * N * (N + 1) * (2 * N + 1) * (3 * N * N + 3 * N - 1)) / 15.0; normF = std::sqrt(diag + off_diags); TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), - Teuchos::as(normF), Teuchos::as(1.0e-14)); + Teuchos::as(normF), eps); linearProblem->getLHS()->norm1(norms()); - TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(N), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(N), eps); linearProblem->getRHS()->norm1(norms()); - TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), Teuchos::as(1.0e-14)); + TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum_squared), eps); - //if (myImageID==0) myOut << "After Right Scaling" << endl; - //Display_CrsMatrix("A", A, comm, myOut); - //Display_MultiVector("Solution Vector", X, comm, myOut); - //Display_MultiVector("RHS Vector", B, comm, myOut); - //Display_Vector("Scaling Vector", S, comm, myOut); +#ifdef DEBUG_TEST + if (myImageID==0) myOut << "After Right Scaling" << endl; + 
Display_CrsMatrix("A", A, comm, myOut); + Display_MultiVector("Solution Vector", X, comm, myOut); + Display_MultiVector("RHS Vector", B, comm, myOut); + Display_Vector("Scaling Vector", S, comm, myOut); +#endif // Constructor with matrix { From c54fe80458bc5cec1ff52b912f52952779534812 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Thu, 7 Nov 2024 10:48:29 -0500 Subject: [PATCH 103/243] Remove use of Kokkos::Impl::DynRankViewFill Signed-off-by: Paul Zehner --- .../sacado/src/Kokkos_DynRankView_Fad.hpp | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp index e2c1d78aa81b..7e413cbc9393 100644 --- a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp +++ b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp @@ -988,9 +988,16 @@ void deep_copy( typename ViewTraits::non_const_value_type >::value , "Can only deep copy into non-const type" ); - Kokkos::fence(); - Kokkos::Impl::DynRankViewFill< DynRankView >( view , value ); - Kokkos::fence(); + switch(view.rank()) { + case 0: deep_copy(Impl::as_view_of_rank_n<0>(view), value); break; + case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break; + case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break; + case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break; + case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break; + case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break; + case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break; + case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break; + } } // Overload of deep_copy for Fad views intializing to a constant Fad @@ -1010,9 +1017,16 @@ void deep_copy( typename ViewTraits::non_const_value_type >::value , "Can only deep copy into non-const type" ); - Kokkos::fence(); - Kokkos::Impl::DynRankViewFill< DynRankView >( view , value ); - Kokkos::fence(); + switch(view.rank()) { + case 0: 
deep_copy(Impl::as_view_of_rank_n<0>(view), value); break; + case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break; + case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break; + case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break; + case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break; + case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break; + case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break; + case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break; + } } template< class DstType , class SrcType > From c2580a8b738488780c0b95431c5195c0a143f452 Mon Sep 17 00:00:00 2001 From: "Curtis C. Ober" Date: Fri, 8 Nov 2024 10:25:23 -0700 Subject: [PATCH 104/243] Fix eps calc. and rework vector assign loop. Signed-off-by: Curtis C. Ober --- .../LinearProblem/LinearProblem_UnitTests.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp index cdca5cccc8d1..3411eeaa1efa 100644 --- a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp +++ b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp @@ -140,8 +140,7 @@ namespace { // (anonymous) TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( LinearProblem, basic, LO, GO, Scalar, Node ) { using map_type = Tpetra::Map; - using ST = Teuchos::ScalarTraits; - using mag_type = typename ST::magnitudeType; + //using mag_type = typename Teuchos::ScalarTraits::magnitudeType; using MAT = Tpetra::CrsMatrix; using VT = Tpetra::Vector; @@ -198,16 +197,16 @@ namespace { // (anonymous) RCP S = rcp (new VT (map)); // Assign values to the MultiVector based on the global index - for (size_t j = 0; j < numVecs; ++j) { // Loop over each vector (column) - for (GST i = 0; i < globalNumElements; ++i) { - // Assign a value (for example, the global index plus the vector index) - 
X->replaceGlobalValue(i, j, Teuchos::as(i + j + 1)); - B->replaceGlobalValue(i, j, Teuchos::as(i + j + 1)); + for (size_t i = 0; i < numLocal; ++i) { + auto localIndex = map->getLocalElement(i); + auto globalIndex = map->getGlobalElement(i); + S->replaceLocalValue(localIndex, Teuchos::as(globalIndex + 1)); + for (size_t j = 0; j < numVecs; ++j) { // Loop over each vector (column) + // Assign a value (for example, the global index plus the vector index) + X->replaceLocalValue(localIndex, j, Teuchos::as(globalIndex + j + 1)); + B->replaceLocalValue(localIndex, j, Teuchos::as(globalIndex + j + 1)); } } - for (GST i = 0; i < globalNumElements; ++i) { - S->replaceGlobalValue(i, Teuchos::as(i + 1)); - } RCP linearProblem = rcp(new LPT()); @@ -225,14 +224,14 @@ namespace { // (anonymous) Display_Vector("Scaling Vector", S, comm, myOut); #endif - mag_type eps = Teuchos::as(100)*Teuchos::ScalarTraits::eps(); + Scalar eps = Teuchos::as(Teuchos::as(100)*Teuchos::ScalarTraits::eps()); // Original LinearProblem GST N = globalNumElements; double normF = std::sqrt(6*N - 2); TEST_FLOATING_EQUALITY(linearProblem->getMatrix()->getFrobeniusNorm(), Teuchos::as(normF), eps); - Array norms(numVecs); + Array norms(numVecs); linearProblem->getLHS()->norm1(norms()); size_t vector_sum = N*(N+1)/2; TEST_FLOATING_EQUALITY(norms[0], Teuchos::as(vector_sum), eps); From eb2709c46d2bad3e5aaff6ebe1fb9516d39baccc Mon Sep 17 00:00:00 2001 From: "Curtis C. Ober" Date: Fri, 8 Nov 2024 14:39:07 -0700 Subject: [PATCH 105/243] Fix bug and turn off debug output. Signed-off-by: Curtis C. 
Ober --- .../core/test/LinearProblem/LinearProblem_UnitTests.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp index 3411eeaa1efa..dd164350b978 100644 --- a/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp +++ b/packages/tpetra/core/test/LinearProblem/LinearProblem_UnitTests.cpp @@ -30,7 +30,7 @@ namespace { // (anonymous) using GST = Tpetra::global_size_t; -#define DEBUG_TEST +#undef DEBUG_TEST #ifdef DEBUG_TEST /// \brief Print out pretty version of RowMatrix. template @@ -197,9 +197,8 @@ namespace { // (anonymous) RCP S = rcp (new VT (map)); // Assign values to the MultiVector based on the global index - for (size_t i = 0; i < numLocal; ++i) { - auto localIndex = map->getLocalElement(i); - auto globalIndex = map->getGlobalElement(i); + for (size_t localIndex = 0; localIndex < numLocal; ++localIndex) { + auto globalIndex = map->getGlobalElement(localIndex); S->replaceLocalValue(localIndex, Teuchos::as(globalIndex + 1)); for (size_t j = 0; j < numVecs; ++j) { // Loop over each vector (column) // Assign a value (for example, the global index plus the vector index) From 1550fd99fec2caf862df4814ec21fef590f52f37 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 8 Nov 2024 20:25:52 -0700 Subject: [PATCH 106/243] Tacho : new options (dofs-per-node, pivot-tol, amd) Signed-off-by: iyamazaki --- packages/amesos2/src/Amesos2_Tacho_decl.hpp | 2 + packages/amesos2/src/Amesos2_Tacho_def.hpp | 27 +++++-- .../frosch/test/Thyra_Xpetra_Laplace/main.cpp | 1 + .../shylu_node/tacho/cmake/Tacho_config.h.in | 3 + .../tacho/example/Tacho_ExampleDriver.hpp | 36 +++++++-- .../tacho/src/Tacho_CrsMatrixBase.hpp | 5 +- .../shylu_node/tacho/src/Tacho_Driver.hpp | 54 ++++++++++++- .../tacho/src/impl/Tacho_Driver_Impl.hpp | 19 ++++- .../tacho/src/impl/Tacho_GraphTools.hpp | 5 ++ 
.../tacho/src/impl/Tacho_GraphTools_Metis.cpp | 35 ++++++-- .../tacho/src/impl/Tacho_GraphTools_Metis.hpp | 13 ++- .../tacho/src/impl/Tacho_LU_Internal.hpp | 19 ++++- .../tacho/src/impl/Tacho_Lapack_Team.hpp | 79 +++++++++++++++++-- .../src/impl/Tacho_NumericTools_Base.hpp | 3 +- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 21 +++-- .../src/impl/Tacho_NumericTools_Serial.hpp | 3 +- .../impl/Tacho_TeamFunctor_FactorizeLU.hpp | 23 +++++- 17 files changed, 301 insertions(+), 47 deletions(-) diff --git a/packages/amesos2/src/Amesos2_Tacho_decl.hpp b/packages/amesos2/src/Amesos2_Tacho_decl.hpp index 95c71b184dc6..07acdaa91e49 100644 --- a/packages/amesos2/src/Amesos2_Tacho_decl.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_decl.hpp @@ -196,6 +196,8 @@ class TachoSolver : public SolverCore int small_problem_threshold_size; int streams; bool verbose; + int dofs_per_node; + bool pivot_pert; // int num_kokkos_threads; // int max_num_superblocks; } data_; diff --git a/packages/amesos2/src/Amesos2_Tacho_def.hpp b/packages/amesos2/src/Amesos2_Tacho_def.hpp index 221e505dbc54..e4f1bd98566b 100644 --- a/packages/amesos2/src/Amesos2_Tacho_def.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_def.hpp @@ -27,10 +27,12 @@ TachoSolver::TachoSolver( Teuchos::RCP B ) : SolverCore(A, X, B) { - data_.method = 1; // Cholesky - data_.variant = 2; // solver variant - data_.streams = 1; // # of streams - data_.verbose = false; // verbose + data_.method = 1; // Cholesky + data_.variant = 2; // solver variant + data_.streams = 1; // # of streams + data_.dofs_per_node = 1; // DoFs / node + data_.pivot_pert = false; // Diagonal pertubation + data_.verbose = false; // verbose } @@ -82,7 +84,11 @@ TachoSolver::symbolicFactorization_impl() // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks); // Symbolic factorization currently must be done on host - data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_); + if (data_.dofs_per_node > 1) { + 
data_.solver.analyze(this->globalNumCols_, data_.dofs_per_node, host_row_ptr_view_, host_cols_view_); + } else { + data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_); + } data_.solver.initialize(); } return status; @@ -102,6 +108,11 @@ TachoSolver::numericFactorization_impl() if(do_optimization()) { this->matrixA_->returnValues_kokkos_view(device_nzvals_view_); } + if (data_.pivot_pert) { + data_.solver.useDefaultPivotTolerance(); + } else { + data_.solver.useNoPivotTolerance(); + } data_.solver.factorize(device_nzvals_view_); } return status; @@ -223,6 +234,10 @@ TachoSolver::setParameters_impl(const Teuchos::RCPget ("verbose", false); // # of streams data_.streams = parameterList->get ("num-streams", 1); + // DoFs / node + data_.dofs_per_node = parameterList->get ("dofs-per-node", 1); + // Perturb tiny pivots + data_.pivot_pert = parameterList->get ("perturb-pivot", false); // TODO: Confirm param options // data_.num_kokkos_threads = parameterList->get("kokkos-threads", 1); // data_.max_num_superblocks = parameterList->get("max-num-superblocks", 4); @@ -243,6 +258,8 @@ TachoSolver::getValidParameters_impl() const pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK."); pl->set("verbose", false, "Verbosity"); pl->set("num-streams", 1, "Number of GPU streams"); + pl->set("dofs-per-node", 1, "DoFs per node"); + pl->set("perturb-pivot", false, "Perturb tiny pivots"); // TODO: Confirm param options // pl->set("kokkos-threads", 1, "Number of threads"); diff --git a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp index 18b413f02d96..3e0fbdb81231 100644 --- a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp +++ b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp @@ -270,6 +270,7 @@ int main(int argc, char *argv[]) } else { assert(false); } + writeMM("Laplace.mtx",KMonolithic); RCP 
> xSolution = MultiVectorFactory::Build(KMonolithic->getMap(),1); RCP > xRightHandSide = MultiVectorFactory::Build(KMonolithic->getMap(),1); diff --git a/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in b/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in index 9daaa2f69860..a537bf9648c0 100644 --- a/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in +++ b/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in @@ -25,6 +25,9 @@ /* Define if want to build with CHOLMOD enabled */ #cmakedefine TACHO_HAVE_SUITESPARSE +/* Define if want to build with TrilinosSS enabled */ +#cmakedefine TACHO_HAVE_TRILINOS_SS + /* Define if want to build with VTune enabled */ #cmakedefine TACHO_HAVE_VTUNE diff --git a/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp b/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp index 450e04608954..dce1a645b801 100644 --- a/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp +++ b/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp @@ -25,8 +25,11 @@ template int driver(int argc, char *argv[]) { std::string file = "test.mtx"; std::string graph_file = ""; std::string weight_file = ""; + int dofs_per_node = 1; + bool perturbPivot = false; int nrhs = 1; bool randomRHS = true; + bool onesRHS = false; std::string method_name = "chol"; int method = 1; // 1 - Chol, 2 - LDL, 3 - SymLU int small_problem_thres = 1024; @@ -47,6 +50,8 @@ template int driver(int argc, char *argv[]) { opts.set_option("file", "Input file (MatrixMarket SPD matrix)", &file); opts.set_option("graph", "Input condensed graph", &graph_file); opts.set_option("weight", "Input condensed graph weight", &weight_file); + opts.set_option("dofs-per-node", "# DoFs per node", &dofs_per_node); + opts.set_option("perturb", "Flag to perturb tiny pivots", &perturbPivot); opts.set_option("nrhs", "Number of RHS vectors", &nrhs); opts.set_option("method", "Solution method: chol, ldl, lu", &method_name); 
opts.set_option("small-problem-thres", "LAPACK is used smaller than this thres", &small_problem_thres); @@ -55,6 +60,7 @@ template int driver(int argc, char *argv[]) { opts.set_option("device-solve-thres", "Device function is used above this subproblem size", &device_solve_thres); opts.set_option("variant", "algorithm variant in levelset scheduling; 0, 1 and 2", &variant); opts.set_option("nstreams", "# of streams used in CUDA; on host, it is ignored", &nstreams); + opts.set_option("one-rhs", "Set RHS to be ones", &onesRHS); opts.set_option("no-warmup", "Flag to turn off warmup", &no_warmup); opts.set_option("nfacts", "# of factorizations to perform", &nfacts); opts.set_option("nsolves", "# of solves to perform", &nsolves); @@ -125,6 +131,8 @@ template int driver(int argc, char *argv[]) { if (!in.good()) { std::cout << "Failed in open the file: " << graph_file << std::endl; return -1; + } else if (verbose) { + std::cout << " > Condensed graph file: " << graph_file << std::endl; } in >> m_graph; @@ -135,8 +143,10 @@ template int driver(int argc, char *argv[]) { aj_graph = ordinal_type_array_host("aj", ap_graph(m_graph)); for (ordinal_type i = 0; i < m_graph; ++i) { const ordinal_type jbeg = ap_graph(i), jend = ap_graph(i + 1); - for (ordinal_type j = jbeg; j < jend; ++j) + for (ordinal_type j = jbeg; j < jend; ++j) { in >> aj_graph(j); + aj_graph(j) --; // base-one + } } } @@ -146,6 +156,8 @@ template int driver(int argc, char *argv[]) { if (!in.good()) { std::cout << "Failed in open the file: " << weight_file << std::endl; return -1; + } else if (verbose) { + std::cout << " > Weight file for condensed graph: " << weight_file << std::endl; } ordinal_type m(0); in >> m; @@ -160,17 +172,21 @@ template int driver(int argc, char *argv[]) { Tacho::Driver solver; /// common options - solver.setSolutionMethod(method); - solver.setSmallProblemThresholdsize(small_problem_thres); solver.setVerbose(verbose); + solver.setSolutionMethod(method); + 
solver.setLevelSetOptionAlgorithmVariant(variant); + solver.setLevelSetOptionNumStreams(nstreams); /// graph options solver.setOrderConnectedGraphSeparately(); /// levelset options + solver.setSmallProblemThresholdsize(small_problem_thres); solver.setLevelSetOptionDeviceFunctionThreshold(device_factor_thres, device_solve_thres); - solver.setLevelSetOptionAlgorithmVariant(variant); - solver.setLevelSetOptionNumStreams(nstreams); + if (perturbPivot) { + if (verbose) std::cout << " > perturb tiny pivots" << std::endl; + solver.useDefaultPivotTolerance(); + } auto values_on_device = Kokkos::create_mirror_view(typename device_type::memory_space(), A.Values()); Kokkos::deep_copy(values_on_device, A.Values()); @@ -178,7 +194,10 @@ template int driver(int argc, char *argv[]) { /// inputs are used for graph reordering and analysis if (m_graph > 0 && m_graph < A.NumRows()) solver.analyze(A.NumRows(), A.RowPtr(), A.Cols(), m_graph, ap_graph, aj_graph, aw_graph); - else + else if (dofs_per_node > 1) { + if (verbose) std::cout << " > DoFs / node = " << dofs_per_node << std::endl; + solver.analyze(A.NumRows(), dofs_per_node, A.RowPtr(), A.Cols()); + } else solver.analyze(A.NumRows(), A.RowPtr(), A.Cols()); /// create numeric tools and levelset tools @@ -202,7 +221,10 @@ template int driver(int argc, char *argv[]) { t("t", A.NumRows(), nrhs); // temp workspace (store permuted rhs) { - if (randomRHS) { + if (onesRHS) { + const value_type one(1.0); + Kokkos::deep_copy (b, one); + } else if (randomRHS) { Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(b, random, value_type(1)); } else { diff --git a/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp b/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp index 5f5278497a86..ed9b3e0ae693 100644 --- a/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp +++ b/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp @@ -371,7 +371,8 @@ inline static void applyPermutationToCrsMatrixLower(/* */ 
CrsMatrixType &A, cons template inline double computeRelativeResidual(const CrsMatrixBase &A, const Kokkos::View &x, - const Kokkos::View &b) { + const Kokkos::View &b, + const bool verbose = false) { const bool test = (size_t(A.NumRows()) != size_t(A.NumCols()) || size_t(A.NumRows()) != size_t(b.extent(0)) || size_t(x.extent(0)) != size_t(b.extent(0)) || size_t(x.extent(1)) != size_t(b.extent(1))); if (test) @@ -405,6 +406,8 @@ inline double computeRelativeResidual(const CrsMatrixBase diff += arith_traits::real((h_b(i, p) - s) * arith_traits::conj(h_b(i, p) - s)); } } + if (verbose) + std::cout << " Relative residual norm = " << sqrt(diff) << " / " << sqrt(norm) << " = " << sqrt(diff/norm) << std::endl; return sqrt(diff / norm); } diff --git a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp index 274f4c952092..a8da49d93806 100644 --- a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp +++ b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp @@ -16,6 +16,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "Tacho.hpp" +#include "Tacho_Util.hpp" #include #include @@ -24,7 +25,7 @@ namespace Tacho { /// forward decl class Graph; -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) class GraphTools_Metis; #else class GraphTools; @@ -42,6 +43,7 @@ template class NumericToolsLe template struct Driver { public: using value_type = ValueType; + using mag_type = typename ArithTraits::mag_type; using device_type = DeviceType; using exec_space = typename device_type::execution_space; using exec_memory_space = typename device_type::memory_space; @@ -63,7 +65,7 @@ template struct Driver { using crs_matrix_type = CrsMatrixBase; using crs_matrix_type_host = CrsMatrixBase; -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) using graph_tools_type = GraphTools_Metis; #else using graph_tools_type = GraphTools; @@ -160,6 
+162,8 @@ template struct Driver { ordinal_type _variant; // algorithmic variant in levelset 0: naive, 1: invert diagonals ordinal_type _nstreams; // on cuda, multi streams are used + mag_type _pivot_tol; // tolerance for tiny pivot perturbation + // parallelism and memory constraint is made via this parameter ordinal_type _max_num_superblocks; // # of superblocks in the memoyrpool @@ -206,6 +210,10 @@ template struct Driver { void setLevelSetOptionNumStreams(const ordinal_type nstreams); void setLevelSetOptionAlgorithmVariant(const ordinal_type variant); + void setPivotTolerance(const mag_type pivot_tol); + void useNoPivotTolerance(); + void useDefaultPivotTolerance(); + /// /// get interface /// @@ -222,6 +230,7 @@ template struct Driver { template int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj, const bool duplicate = false) { + _m = m; if (duplicate) { @@ -270,6 +279,7 @@ template struct Driver { const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) { _m = m; + // this takes the user-specified perm, such that analyze() won't call graph partitioner if (duplicate) { /// for most cases, ap and aj are from host; so construct ap and aj and mirror to device _h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0)); @@ -375,6 +385,46 @@ template struct Driver { return analyze(); } + template + int analyze(const ordinal_type m, const ordinal_type blk_size, + const arg_size_type_array &ap, const arg_ordinal_type_array &aj, + const bool duplicate = false) { + + if (blk_size > 1) { + //condense graph before calling analyze + const size_type nnz = ap(m); + size_type m_graph = m / blk_size; + size_type nnz_graph = nnz / (blk_size*blk_size); + TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != blk_size*blk_size * nnz_graph), + std::logic_error, "Failed to initialize the condensed graph"); + + size_type_array_host ap_graph + 
(Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph); + ordinal_type_array_host aj_graph + (Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph); + ordinal_type_array_host aw_graph + (Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph); + // condense the graph + nnz_graph = 0; + ap_graph(0) = 0; + for (size_type i = 0; i < m; i += blk_size) { + for (size_type k = ap(i); k < ap(i+1); k++) { + if (aj(k)%blk_size == 0) { + aj_graph(nnz_graph) = aj(k)/blk_size; + nnz_graph++; + } + aw_graph(i/blk_size) = blk_size; + ap_graph((i/blk_size)+1) = nnz_graph; + } + } + TACHO_TEST_FOR_EXCEPTION((nnz != blk_size*blk_size * nnz_graph), + std::logic_error, "Failed to condense graph"); + return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate); + } else { + return analyze(m, ap, aj, duplicate); + } + } + int initialize(); int factorize(const value_type_array &ax); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp index bf5e720265ee..605130e14fa9 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp @@ -26,7 +26,7 @@ Driver::Driver() _h_perm(), _peri(), _h_peri(), _m_graph(0), _nnz_graph(0), _h_ap_graph(), _h_aj_graph(), _h_perm_graph(), _h_peri_graph(), _nsupernodes(0), _N(nullptr), _verbose(0), _small_problem_thres(1024), _serial_thres_size(-1), _mb(-1), _nb(-1), _front_update_mode(-1), _levelset(0), _device_level_cut(0), _device_factor_thres(128), - _device_solve_thres(128), _variant(2), _nstreams(16), _max_num_superblocks(-1) {} + _device_solve_thres(128), _variant(2), _nstreams(16), _pivot_tol(0.0), _max_num_superblocks(-1) {} /// /// duplicate the object @@ -157,6 +157,19 @@ template void Driver::setLevelSetOptionNumStr _nstreams = nstreams; } +template void Driver::setPivotTolerance(const mag_type pivot_tol) { + _pivot_tol = pivot_tol; +} + 
+template void Driver::useNoPivotTolerance() { + _pivot_tol = 0.0; +} + +template void Driver::useDefaultPivotTolerance() { + using arith_traits = ArithTraits; + _pivot_tol = sqrt(arith_traits::epsilon()); +} + /// /// get interface /// @@ -373,7 +386,7 @@ template int Driver::factorize(const value_ty if (_m < _small_problem_thres) { factorize_small_host(ax); } else { - _N->factorize(ax, _verbose); + _N->factorize(ax, _pivot_tol, _verbose); } return 0; } @@ -541,7 +554,7 @@ double Driver::computeRelativeResidual(const value_type_array &ax, const CrsMatrixBase A; A.setExternalMatrix(_m, _m, _nnz, _ap, _aj, ax); - return Tacho::computeRelativeResidual(A, x, b); + return Tacho::computeRelativeResidual(A, x, b, _verbose); } template diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp index 9d48cd14fb96..a4a7c2948e46 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp @@ -67,6 +67,11 @@ class GraphTools { _perm(i) = i; _peri(i) = i; } + if (verbose) { + printf("Summary: GraphTools (Default)\n"); + printf("=============================\n"); + printf( " Use Natural Ordering\n\n" ); + } } ordinal_type_array PermVector() const { return _perm; } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp index a85ef651cc4a..a475f729f38a 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp @@ -13,7 +13,7 @@ #include "Tacho_Util.hpp" -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) #include "Tacho_GraphTools_Metis.hpp" namespace Tacho { @@ -39,8 +39,15 @@ GraphTools_Metis::GraphTools_Metis(const Graph &g) { for (ordinal_type i = 0; i < 
static_cast(_adjncy.extent(0)); ++i) _adjncy(i) = g_col_idx(i); +#if defined(TACHO_HAVE_METIS) + _algo = 2; METIS_SetDefaultOptions(_options); _options[METIS_OPTION_NUMBERING] = 0; +#elif defined(TACHO_HAVE_TRILINOS_SS) + _algo = 1; +#else + _algo = 0; +#endif _perm_t = idx_t_array(do_not_initialize_tag("idx_t_perm"), _nvts); _peri_t = idx_t_array(do_not_initialize_tag("idx_t_peri"), _nvts); @@ -52,7 +59,12 @@ GraphTools_Metis::GraphTools_Metis(const Graph &g) { GraphTools_Metis::~GraphTools_Metis() {} void GraphTools_Metis::setVerbose(const bool verbose) { _verbose = verbose; } -void GraphTools_Metis::setOption(const int id, const idx_t value) { _options[id] = value; } +void GraphTools_Metis::setOption(const int id, const idx_t value) { +#if defined(TACHO_HAVE_METIS) + _options[id] = value; +#endif +} +void GraphTools_Metis::setAlgorithm(const int algo) { _algo = algo; } /// /// reorder by amd @@ -81,13 +93,12 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { Kokkos::Timer timer; double t_metis = 0; - int algo = 2; - if (algo == 0) { + if (_algo == 0) { for (ordinal_type i = 0; i < _nvts; ++i) { _perm(i) = i; _peri(i) = i; } - } else if (algo == 1) { + } else if (_algo == 1) { int ierr = 0; double amd_info[TRILINOS_AMD_INFO]; @@ -100,8 +111,10 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { _peri(_perm(i)) = i; } - TACHO_TEST_FOR_EXCEPTION(ierr != METIS_OK, std::runtime_error, "Failed in trilinos_amd"); + // ierr != TRILINOS_AMD_OK && ierr != TRILINOS_AMD_OK_BUT_JUMBLED + TACHO_TEST_FOR_EXCEPTION(ierr < TRILINOS_AMD_OK, std::runtime_error, "Failed in trilinos_amd"); } else { +#if defined(TACHO_HAVE_METIS) int ierr = 0; idx_t *xadj = (idx_t *)_xadj.data(); @@ -121,11 +134,19 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { } TACHO_TEST_FOR_EXCEPTION(ierr != METIS_OK, std::runtime_error, "Failed in METIS_NodeND"); +#else + TACHO_TEST_FOR_EXCEPTION(true, std::runtime_error, "METIS is not enabled"); +#endif } _is_ordered = 
true; if (verbose) { - printf("Summary: GraphTools (Metis)\n"); + if (_algo == 0) + printf("Summary: GraphTools (Natural)\n"); + else if (_algo == 1) + printf("Summary: GraphTools (AMD)\n"); + else + printf("Summary: GraphTools (Metis)\n"); printf("===========================\n"); switch (verbose) { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp index e3dd1856e601..87119b84de0f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp @@ -16,11 +16,12 @@ #include "Tacho_Util.hpp" -#if defined(TACHO_HAVE_METIS) #include "Tacho_Graph.hpp" #include "trilinos_amd.h" -#include "metis.h" +#if defined(TACHO_HAVE_METIS) + #include "metis.h" +#endif namespace Tacho { @@ -28,6 +29,9 @@ class GraphTools_Metis { public: typedef typename UseThisDevice::type host_device_type; + #if !defined(TACHO_HAVE_METIS) + typedef ordinal_type idx_t; + #endif typedef Kokkos::View idx_t_array; typedef Kokkos::View ordinal_type_array; @@ -36,7 +40,10 @@ class GraphTools_Metis { idx_t _nvts; idx_t_array _xadj, _adjncy, _vwgt; + int _algo; + #if defined(TACHO_HAVE_METIS) idx_t _options[METIS_NOPTIONS]; + #endif // metis output idx_t_array _perm_t, _peri_t; @@ -61,6 +68,7 @@ class GraphTools_Metis { void setVerbose(const bool verbose); void setOption(const int id, const idx_t value); + void setAlgorithm(const int algo); template ordering_type amd_order(ordering_type n, const ordering_type *xadj, @@ -82,4 +90,3 @@ class GraphTools_Metis { } // namespace Tacho #endif -#endif diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp index b30c0c85c34f..26f4c4c202c6 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp @@ -25,7 
+25,6 @@ template <> struct LU { template KOKKOS_INLINE_FUNCTION static int invoke(MemberType &member, const ViewTypeA &A, const ViewTypeP &P) { typedef typename ViewTypeA::non_const_value_type value_type; - // typedef typename ViewTypeP::non_const_value_type p_value_type; static_assert(ViewTypeA::rank == 2, "A is not rank 2 view."); static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); @@ -41,6 +40,24 @@ template <> struct LU { return r_val; } + template + KOKKOS_INLINE_FUNCTION static int invoke(MemberType &member, const double tol, const ViewTypeA &A, const ViewTypeP &P) { + typedef typename ViewTypeA::non_const_value_type value_type; + + static_assert(ViewTypeA::rank == 2, "A is not rank 2 view."); + static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); + + TACHO_TEST_FOR_ABORT(P.extent(0) < 4 * A.extent(0), "P should be 4*A.extent(0) ."); + + int r_val(0); + const ordinal_type m = A.extent(0), n = A.extent(1); + if (m > 0 && n > 0) { + /// factorize LU + LapackTeam::getrf(member, tol, m, n, A.data(), A.stride_1(), P.data(), &r_val); + } + return r_val; + } + template KOKKOS_INLINE_FUNCTION static int modify(const MemberType &member, const ordinal_type m, const ViewTypeP &P) { static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp index cde52b82693a..939ff6f240d8 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp @@ -231,13 +231,13 @@ template struct LapackTeam { template static KOKKOS_INLINE_FUNCTION void getrf(const MemberType &member, const int m, const int n, T *KOKKOS_RESTRICT A, const int as1, int *KOKKOS_RESTRICT ipiv, int *info) { + *info = 0; if (m <= 0 || n <= 0) return; using arith_traits = ArithTraits; using mag_type = typename arith_traits::mag_type; - - const T zero(0); + const mag_type 
zero(0); const int as0 = 1; for (int p = 0; p < m; ++p) { const int iend = m - p - 1, jend = n - p - 1; @@ -248,8 +248,9 @@ template struct LapackTeam { *KOKKOS_RESTRICT a12 = A + (p) * as0 + (p + 1) * as1, *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; + int idx(0); + mag_type val(0.0); { - int idx(0); using reducer_value_type = typename Kokkos::MaxLoc::value_type; reducer_value_type value; Kokkos::MaxLoc reducer_value(value); @@ -265,10 +266,11 @@ template struct LapackTeam { reducer_value); member.team_barrier(); idx = value.loc; + val = value.val; /// pivot Kokkos::single(Kokkos::PerThread(member), [&]() { - if (*info == 0 && *alpha11 == zero) { + if (*info == 0 && val == zero) { *info = 1+p; } ipiv[p] = p + idx + 1; @@ -279,9 +281,74 @@ template struct LapackTeam { member.team_barrier(); } } - + const T alpha = *alpha11; // swapped, so contains new pivot + if(val != zero) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, iend), [&](const int &i) { a21[i * as0] /= alpha; }); + member.team_barrier(); + } + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, jend), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, iend), + [&](const int &i) { A22[i * as0 + j * as1] -= a21[i * as0] * a12[j * as1]; }); + }); member.team_barrier(); - const T alpha = *alpha11; + } + } + + template + static KOKKOS_INLINE_FUNCTION void getrf(const MemberType &member, const double tol, const int m, const int n, T *KOKKOS_RESTRICT A, + const int as1, int *KOKKOS_RESTRICT ipiv, int *info) { + *info = 0; + if (m <= 0 || n <= 0) + return; + + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + const mag_type zero(0); + //const mag_type tol = sqrt(arith_traits::epsilon()); + const int as0 = 1; + for (int p = 0; p < m; ++p) { + const int iend = m - p - 1, jend = n - p - 1; + T *KOKKOS_RESTRICT alpha11 = A + (p)*as0 + (p)*as1, // as0 & as1 are leading dimension for rows & cols + *KOKKOS_RESTRICT AB = A + (p) * 
as0, + *KOKKOS_RESTRICT ABR = alpha11, + *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p) * as1, + *KOKKOS_RESTRICT a12 = A + (p) * as0 + (p + 1) * as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; + + int idx(0); + mag_type val(0.0); + { + using reducer_value_type = typename Kokkos::MaxLoc::value_type; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, 1 + iend), + [&](const int &i, reducer_value_type &update) { + const mag_type val = arith_traits::abs(ABR[i * as0]); + if (val > update.val) { + update.val = val; + update.loc = i; + } + }, + reducer_value); + member.team_barrier(); + idx = value.loc; + val = value.val; + + /// pivot + Kokkos::single(Kokkos::PerThread(member), [&]() { + if (val < tol) { + ABR[idx * as0] = (arith_traits::real(ABR[idx * as0]) < zero ? -T(tol) : T(tol)); + } + ipiv[p] = p + idx + 1; + }); + if (idx) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), + [&](const int &j) { swap(AB[j * as1], AB[idx * as0 + j * as1]); }); + member.team_barrier(); + } + } + const T alpha = *alpha11; // swapped, so contains new pivot Kokkos::parallel_for(Kokkos::TeamVectorRange(member, iend), [&](const int &i) { a21[i * as0] /= alpha; }); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, jend), [&](const int &j) { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp index 312c2bfcefd9..5430e789a462 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp @@ -24,6 +24,7 @@ namespace Tacho { template class NumericToolsBase { public: using value_type = ValueType; + using mag_type = typename ArithTraits::mag_type; using device_type = DeviceType; using exec_space = typename device_type::execution_space; using exec_memory_space = typename 
device_type::memory_space; @@ -243,7 +244,7 @@ template class NumericToolsBase { } } - inline virtual void factorize(const value_type_array &ax, const ordinal_type verbose = 0) { + inline virtual void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) { TACHO_TEST_FOR_EXCEPTION(true, std::logic_error, "The function should be overriden by derived classes"); } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 18897036922a..334d514821b7 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -88,6 +88,7 @@ #endif #endif + namespace Tacho { template @@ -112,6 +113,7 @@ class NumericToolsLevelSet : public NumericToolsBase { using typename base_type::supernode_info_type; using typename base_type::supernode_type_array_host; using typename base_type::value_type; + using typename base_type::mag_type; using typename base_type::int_type_array; using typename base_type::value_type_array; using typename base_type::value_type_matrix; @@ -3669,7 +3671,7 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::parallel_for( policy, KOKKOS_LAMBDA(const ordinal_type &i) { buf_solve_nrhs_ptr(i) = nrhs * buf_solve_ptr(i); }); Kokkos::deep_copy(_h_buf_solve_nrhs_ptr, _buf_solve_nrhs_ptr); - _nrhs = nrhs; + _nrhs = nrhs; } } } @@ -4204,7 +4206,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } } - inline void factorizeLU(const value_type_array &ax, const ordinal_type verbose) { + inline void factorizeLU(const value_type_array &ax, const mag_type pivot_tol, const ordinal_type verbose) { constexpr bool is_host = std::is_same::value; Kokkos::Timer timer; Kokkos::Timer tick; @@ -4278,7 +4280,12 @@ class NumericToolsLevelSet : public NumericToolsBase { team_policy_factor policy_factor(1, 1, 1); 
team_policy_update policy_update(1, 1, 1); functor_type functor(_info, _factorize_mode, _level_sids, _piv, _buf, &rval); - + if (pivot_tol > 0.0) { + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + const mag_type tol = sqrt(arith_traits::epsilon()); + functor.setDiagPertubationTol(pivot_tol); + } // get max vector length const ordinal_type vmax = policy_factor.vector_length_max(); { @@ -4333,7 +4340,9 @@ class NumericToolsLevelSet : public NumericToolsBase { if (rval != 0) { TACHO_TEST_FOR_EXCEPTION(rval, std::runtime_error, "GETRF (team) returns non-zero error code."); } - + if (_status != 0) { + TACHO_TEST_FOR_EXCEPTION(rval, std::runtime_error, "GETRF (device) returns non-zero error code."); + } Kokkos::parallel_for("update factor", policy_update, functor); if (verbose) { Kokkos::fence(); time_update += tick.seconds(); @@ -4564,7 +4573,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } } - inline void factorize(const value_type_array &ax, const ordinal_type verbose = 0) override { + inline void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) override { Kokkos::deep_copy(_superpanel_buf, value_type(0)); switch (this->getSolutionMethod()) { case 1: { /// Cholesky @@ -4600,7 +4609,7 @@ class NumericToolsLevelSet : public NumericToolsBase { track_alloc(_piv.span() * sizeof(ordinal_type)); } } - factorizeLU(ax, verbose); + factorizeLU(ax, pivot_tol, verbose); break; } default: { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp index 584930b56525..86b65e7ef78f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp @@ -45,6 +45,7 @@ class NumericToolsSerial : public NumericToolsBase { using typename base_type::ordinal_type_array; using typename 
base_type::ordinal_type_array_host; using typename base_type::size_type_array; + using typename base_type::mag_type; using typename base_type::value_type; using typename base_type::value_type_array; using typename base_type::value_type_matrix; @@ -475,7 +476,7 @@ class NumericToolsSerial : public NumericToolsBase { /// /// main interface /// - inline void factorize(const value_type_array &ax, const ordinal_type verbose = 0) override { + inline void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) override { { const bool test = !std::is_same::value; TACHO_TEST_FOR_EXCEPTION(test, std::logic_error, "Serial interface works on host device only"); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp index 3ad435b8e853..33caa7532fb0 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp @@ -34,6 +34,9 @@ template struct TeamFunctor_FactorizeLU { using value_type_array = typename supernode_info_type::value_type_array; using value_type_matrix = typename supernode_info_type::value_type_matrix; + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + private: supernode_info_type _info; ordinal_type_array _compute_mode, _level_sids; @@ -44,6 +47,7 @@ template struct TeamFunctor_FactorizeLU { size_type_array _buf_ptr; value_type_array _buf; + mag_type _tol; int *_rval; public: @@ -54,7 +58,8 @@ template struct TeamFunctor_FactorizeLU { TeamFunctor_FactorizeLU(const supernode_info_type &info, const ordinal_type_array &compute_mode, const ordinal_type_array &level_sids, const ordinal_type_array &piv, const value_type_array buf, int *rval) - : _info(info), _compute_mode(compute_mode), _level_sids(level_sids), _piv(piv), _buf(buf), _rval(rval) {} + : _info(info), 
_compute_mode(compute_mode), _level_sids(level_sids), _piv(piv), _buf(buf), + _tol(0.0), _rval(rval) {} inline void setRange(const ordinal_type pbeg, const ordinal_type pend) { _pbeg = pbeg; @@ -62,6 +67,7 @@ template struct TeamFunctor_FactorizeLU { } inline void setBufferPtr(const size_type_array &buf_ptr) { _buf_ptr = buf_ptr; } + inline void setDiagPertubationTol(const mag_type tol) { _tol = tol; } /// /// Main functions @@ -78,7 +84,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1); @@ -117,7 +126,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1); @@ -178,7 +190,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1); From 9990b4483b333331165a87d141dff32f3054ce61 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 8 Nov 2024 21:49:56 -0700 Subject: [PATCH 107/243] Tacho : compiler warnings Signed-off-by: iyamazaki Tacho : compiler warnings Signed-off-by: iyamazaki --- .../shylu/shylu_node/tacho/src/Tacho_Driver.hpp | 8 ++++---- .../tacho/src/impl/Tacho_Lapack_Team.hpp | 12 ++++++------ .../tacho/src/impl/Tacho_NumericTools_LevelSet.hpp | 3 --- .../src/impl/Tacho_TeamFunctor_ExtractCRS.hpp | 14 +++++++------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp 
b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp index a8da49d93806..17f871051458 100644 --- a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp +++ b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp @@ -393,9 +393,9 @@ template struct Driver { if (blk_size > 1) { //condense graph before calling analyze const size_type nnz = ap(m); - size_type m_graph = m / blk_size; + ordinal_type m_graph = m / blk_size; size_type nnz_graph = nnz / (blk_size*blk_size); - TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != blk_size*blk_size * nnz_graph), + TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != size_type(blk_size*blk_size) * nnz_graph), std::logic_error, "Failed to initialize the condensed graph"); size_type_array_host ap_graph @@ -407,7 +407,7 @@ template struct Driver { // condense the graph nnz_graph = 0; ap_graph(0) = 0; - for (size_type i = 0; i < m; i += blk_size) { + for (ordinal_type i = 0; i < m; i += blk_size) { for (size_type k = ap(i); k < ap(i+1); k++) { if (aj(k)%blk_size == 0) { aj_graph(nnz_graph) = aj(k)/blk_size; @@ -417,7 +417,7 @@ template struct Driver { ap_graph((i/blk_size)+1) = nnz_graph; } } - TACHO_TEST_FOR_EXCEPTION((nnz != blk_size*blk_size * nnz_graph), + TACHO_TEST_FOR_EXCEPTION((nnz != size_type(blk_size*blk_size) * nnz_graph), std::logic_error, "Failed to condense graph"); return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate); } else { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp index 939ff6f240d8..f7308a444b94 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp @@ -257,9 +257,9 @@ template struct LapackTeam { Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, 1 + iend), [&](const int &i, reducer_value_type &update) { - const mag_type val = arith_traits::abs(ABR[i * as0]); - if (val > 
update.val) { - update.val = val; + const mag_type val_i = arith_traits::abs(ABR[i * as0]); + if (val_i > update.val) { + update.val = val_i; update.loc = i; } }, @@ -324,9 +324,9 @@ template struct LapackTeam { Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, 1 + iend), [&](const int &i, reducer_value_type &update) { - const mag_type val = arith_traits::abs(ABR[i * as0]); - if (val > update.val) { - update.val = val; + const mag_type val_i = arith_traits::abs(ABR[i * as0]); + if (val_i > update.val) { + update.val = val_i; update.loc = i; } }, diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 334d514821b7..535b117e4c9a 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -4281,9 +4281,6 @@ class NumericToolsLevelSet : public NumericToolsBase { team_policy_update policy_update(1, 1, 1); functor_type functor(_info, _factorize_mode, _level_sids, _piv, _buf, &rval); if (pivot_tol > 0.0) { - using arith_traits = ArithTraits; - using mag_type = typename arith_traits::mag_type; - const mag_type tol = sqrt(arith_traits::epsilon()); functor.setDiagPertubationTol(pivot_tol); } // get max vector length diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp index 3970b8f9c213..5728d40b5f2f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp @@ -192,7 +192,7 @@ template struct TeamFunctor_ExtractCrs { [&](const int& i) { // diagonal block ordinal_type j; - for (ordinal_type j = i; j < s.m; j++) { + for (j = i; j < s.m; j++) { if (AT(i,j) != zero) { int nnz = _rowptr[i+offm]; _colind[nnz] = j+offm; @@ -202,8 +202,8 @@ 
template struct TeamFunctor_ExtractCrs { } // off-diagonal blocksa j = s.m; - for (ordinal_type id = s.sid_col_begin + 1; id < s.sid_col_end - 1; id++) { - for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { + for (ordinal_type blk_id = s.sid_col_begin + 1; blk_id < s.sid_col_end - 1; blk_id++) { + for (ordinal_type k = _info.sid_block_colidx(blk_id).second; k < _info.sid_block_colidx(blk_id + 1).second; k++) { if (AT(i,j) != zero) { int nnz = _rowptr[i+offm]; _colind[nnz] = _info.gid_colidx(k+offn); @@ -262,8 +262,8 @@ template struct TeamFunctor_ExtractCrs { } // off-diagonals (each thread extract col, needing atomic-add) ordinal_type i = s.m; - for (ordinal_type id = s.sid_col_begin + 1; id < s.sid_col_end - 1; id++) { - for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { + for (ordinal_type blk_id = s.sid_col_begin + 1; blk_id < s.sid_col_end - 1; blk_id++) { + for (ordinal_type k = _info.sid_block_colidx(blk_id).second; k < _info.sid_block_colidx(blk_id + 1).second; k++) { if (AL(i, j) != zero) { ordinal_type gid_i = _info.gid_colidx(k+offn); Kokkos::atomic_add(&(_rowptr[1+gid_i]), 1); @@ -329,8 +329,8 @@ template struct TeamFunctor_ExtractCrs { } // off-diagonals (each thread extract col, needing atomic-add) ordinal_type i = s.m; - for (ordinal_type id = s.sid_col_begin + 1; id < s.sid_col_end - 1; id++) { - for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { + for (ordinal_type blk_id = s.sid_col_begin + 1; blk_id < s.sid_col_end - 1; blk_id++) { + for (ordinal_type k = _info.sid_block_colidx(blk_id).second; k < _info.sid_block_colidx(blk_id + 1).second; k++) { if (AL(i, j) != zero) { ordinal_type gid_i = _info.gid_colidx(k+offn); ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr[gid_i]), 1); From 5d53b7c53af1777fe9d4ef69993a6a3f02e757df Mon Sep 17 00:00:00 2001 From: "Samuel E. 
Browne" Date: Mon, 9 Sep 2024 20:46:54 -0600 Subject: [PATCH 108/243] Update CUDA AT2 config to run non-UVM Which will also cause it to start running all of the appropriate tests. If I remember correctly, we had this disabled because the containers were running out of disk space, but we want this enabled for the "real" PR configuration. Signed-off-by: Samuel E. Browne --- .github/workflows/AT2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index c085620db33a..4d6626dd30d4 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -217,7 +217,7 @@ jobs: echo "https://github.com/trilinos/Trilinos/wiki/Containers" >> $GITHUB_STEP_SUMMARY echo "https://gitlab-ex.sandia.gov/trilinos-project/trilinos-containers/-/wikis/Containers-at-Sandia" >> $GITHUB_STEP_SUMMARY - cuda11-uvm-EXPERIMENTAL: + cuda11-EXPERIMENTAL: needs: pre-checks runs-on: [self-hosted, cuda-11.4.2_gcc-10.3.0_openmpi-4.1.6] if: ${{ needs.pre-checks.outputs.should_skip != 'true' && (github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.review.state == 'APPROVED') }} @@ -277,7 +277,7 @@ jobs: type python python3 ${GITHUB_WORKSPACE}/packages/framework/pr_tools/PullRequestLinuxDriverTest.py \ --target-branch-name ${{ github.event.pull_request.base.ref }} \ - --genconfig-build-name rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables \ + --genconfig-build-name rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables \ --pullrequest-number ${{ github.event.pull_request.number }} \ --pullrequest-env-config-file ${GITHUB_WORKSPACE}/packages/framework/pr_tools/trilinos_pr.ini \ --pullrequest-gen-config-file ${GITHUB_WORKSPACE}/packages/framework/GenConfig/src/gen-config.ini \ From 473f826db2bc961ff0c4839905c4ef3e36c07864 Mon Sep 17 00:00:00 2001 From: 
"Samuel E. Browne" Date: Mon, 9 Sep 2024 21:01:20 -0600 Subject: [PATCH 109/243] Enable tests for CUDA AT2 config Signed-off-by: Samuel E. Browne --- packages/framework/ini-files/config-specs.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index de052bca3530..39ba2dfc0f3a 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2102,6 +2102,7 @@ use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA use SEMS_COMMON_CUDA_11 +opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] use NODE-TYPE|CUDA From 7acca366ec34cb41c58e1525c8ca4b33559780ff Mon Sep 17 00:00:00 2001 From: "Samuel E. Browne" Date: Tue, 10 Sep 2024 09:18:58 -0600 Subject: [PATCH 110/243] Reduce build/test parallelism to align with resources Signed-off-by: Samuel E. 
Browne --- .github/workflows/AT2.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index 4d6626dd30d4..e3249f041b74 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -289,8 +289,8 @@ jobs: --ctest-drop-site sems-cdash-son.sandia.gov/cdash \ --filename-subprojects ./package_subproject_list.cmake \ --filename-packageenables ./packageEnables.cmake \ - --max-cores-allowed=96 \ - --num-concurrent-tests=96 + --max-cores-allowed=48 \ + --num-concurrent-tests=48 - name: Summary if: ${{ !cancelled() }} shell: bash -l {0} @@ -398,4 +398,3 @@ jobs: echo "## Helpful Links" >> $GITHUB_STEP_SUMMARY echo "https://github.com/trilinos/Trilinos/wiki/Containers" >> $GITHUB_STEP_SUMMARY echo "https://gitlab-ex.sandia.gov/trilinos-project/trilinos-containers/-/wikis/Containers-at-Sandia" >> $GITHUB_STEP_SUMMARY - From a5eae79fae5772cafb8edf477a3697251f49e02e Mon Sep 17 00:00:00 2001 From: "Samuel E. Browne" Date: Wed, 11 Sep 2024 07:32:23 -0600 Subject: [PATCH 111/243] Disable X11 for container config We disable X11 everywhere else, so be consistent here. In the future, we probably want to enable this, since we DO have X11 in the containers, but getting that hooked up and working is for another day. Signed-off-by: Samuel E. 
Browne --- packages/framework/ini-files/config-specs.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 39ba2dfc0f3a..e3ca8c84d46b 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2103,6 +2103,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA use SEMS_COMMON_CUDA_11 opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON +opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] use NODE-TYPE|CUDA From be3d09a358f89a48be6222450e29d8a39d3b01ed Mon Sep 17 00:00:00 2001 From: "Samuel E. Browne" Date: Mon, 16 Sep 2024 15:09:55 -0600 Subject: [PATCH 112/243] Add run-serial-tests to CUDA container configs Signed-off-by: Samuel E. Browne --- packages/framework/ini-files/config-specs.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index e3ca8c84d46b..5613a6c3b925 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2102,6 +2102,7 @@ use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA use SEMS_COMMON_CUDA_11 +use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF @@ -2121,6 +2122,7 @@ use USE-UVM|YES use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA use SEMS_COMMON_CUDA_11 +use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : OFF opt-set-cmake-var Kokkos_ENABLE_TESTS BOOL FORCE : ON From 014f761dc20db42305d667ab6ef26576da459dd4 Mon Sep 17 00:00:00 2001 From: "Samuel E. 
Browne" Date: Mon, 16 Sep 2024 15:10:14 -0600 Subject: [PATCH 113/243] Turn off smcuda BTL for CUDA container configs For some reason, there are a couple of tests that are failing when RDMA support is initialized. I debugged it to the point of disabling the smcuda BTL in OpenMPI. My guess is that something is wrong with our container build of OpenMPI, OR there is something different hardware-wise about our new Ampere80 machines (I checked the PCI bus addresses because that was something that a brief Google investigation indicated, but they didn't look any worse than the Volta70 machines). Signed-off-by: Samuel E. Browne --- packages/framework/ini-files/config-specs.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 5613a6c3b925..876443311a90 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2105,6 +2105,7 @@ use SEMS_COMMON_CUDA_11 use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF +opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING FORCE : --bind-to;none --mca btl ^smcuda [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] use NODE-TYPE|CUDA From 58637fe0ccbfcb65cd301124e432f606d4411d35 Mon Sep 17 00:00:00 2001 From: "Samuel E. Browne" Date: Tue, 17 Sep 2024 21:27:23 -0600 Subject: [PATCH 114/243] Try recommended Kokkos option Signed-off-by: Samuel E. 
Browne --- packages/framework/ini-files/config-specs.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 876443311a90..85687b8087ed 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2106,6 +2106,7 @@ use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING FORCE : --bind-to;none --mca btl ^smcuda +opt-set-cmake-var Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC BOOL : OFF [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] use NODE-TYPE|CUDA From 2fbc6842bc03fd40294c27e13c241500cc2397d3 Mon Sep 17 00:00:00 2001 From: Alan Williams Date: Mon, 11 Nov 2024 10:10:46 -0700 Subject: [PATCH 115/243] STK: Snapshot 11-11-24 10:10 from Sierra 5.23.1-605-g31b54b7f Signed-off-by: Alan Williams --- packages/stk/CHANGELOG.md | 3 + packages/stk/CMakeLists.txt | 55 ++- packages/stk/cmake/STK_Trilinos_config.h.in | 3 + packages/stk/cmake/fperrno/fperrno_test.cpp | 23 + packages/stk/cmake/fpexcept/fpexcept_test.cpp | 23 + packages/stk/cmake/stk_wrappers.cmake | 29 ++ .../stk/stk_expreval/stk_expreval/Eval.cpp | 2 + .../stk/stk_expreval/stk_expreval/Eval.hpp | 11 + .../stk/stk_expreval/stk_expreval/NgpNode.cpp | 4 +- .../stk/stk_expreval/stk_expreval/NgpNode.hpp | 119 ++++- .../stk/stk_expreval/stk_expreval/Node.cpp | 22 + .../stk/stk_expreval/stk_expreval/Node.hpp | 2 + .../stk/stk_expreval/stk_expreval/Parser.cpp | 5 +- .../run_cmake_stk_standalone_serial | 1 + .../cmake_install_test/spack.cuda.yaml | 8 +- .../stk_spack_build_test_cuda.sh | 51 +- .../stk_test_app/run_cmake_in_spack_env | 17 +- packages/stk/stk_io/stk_io/IossBridge.cpp | 16 - .../stk/stk_mesh/stk_mesh/base/Bucket.cpp | 22 +- .../stk/stk_mesh/stk_mesh/base/BulkData.cpp | 45 +- 
.../stk/stk_mesh/stk_mesh/base/BulkData.hpp | 27 +- .../stk_mesh/stk_mesh/base/DeviceField.hpp | 15 +- .../stk/stk_mesh/stk_mesh/base/DeviceMesh.hpp | 109 ++++- .../stk/stk_mesh/stk_mesh/base/FEMHelpers.cpp | 5 +- .../stk_mesh/base/FieldDataManager.cpp | 13 +- .../stk_mesh/stk_mesh/base/FieldParallel.cpp | 6 +- .../stk/stk_mesh/stk_mesh/base/HostMesh.hpp | 63 ++- .../stk_mesh/base/NgpFieldParallel.hpp | 2 +- .../stk_mesh/stk_mesh/base/NgpMeshBase.hpp | 1 + .../stk_mesh/base/NgpParallelComm.hpp | 15 +- .../stk/stk_mesh/stk_mesh/base/NgpTypes.hpp | 2 +- .../stk/stk_mesh/stk_mesh/base/Selector.hpp | 12 +- .../stk_mesh/baseImpl/BucketRepository.cpp | 40 +- .../stk_mesh/baseImpl/BucketRepository.hpp | 3 +- .../stk_mesh/baseImpl/NgpFieldBLASImpl.hpp | 54 ++- .../stk_mesh/stk_mesh/baseImpl/Partition.cpp | 4 - .../stk_mesh/stk_mesh/baseImpl/Partition.hpp | 31 +- .../stk_mesh/NgpFieldAccess.cpp | 4 +- .../stk_mesh/ParallelSum.cpp | 78 ++- .../stk_mesh/calculate_centroid.hpp | 26 +- .../stk_mesh/perfCommNeighbors.cpp | 2 +- .../stk_util/perfPrintTimersTable.cpp | 111 +++++ .../stk_mesh_fixtures/HexFixture.cpp | 24 +- .../stk_mesh_fixtures/HexFixture.hpp | 9 +- .../UnitTestStkBalanceDecomposition.cpp | 3 +- .../stk_expreval/UnitTestEvaluator.cpp | 74 ++- .../stk_io/UnitTestWriteSTKMesh.cpp | 198 ++++---- .../stk_mesh/UnitTestBulkData.cpp | 9 +- .../stk_mesh/UnitTestChangeParts.cpp | 70 ++- .../stk_mesh/UnitTestCreateFaces.cpp | 12 +- .../stk_mesh/UnitTestDestroyElements.cpp | 3 +- .../stk_unit_tests/stk_mesh/UnitTestField.cpp | 65 +-- .../stk_mesh/UnitTestPartitions.cpp | 2 +- .../skin_mesh/UnitTestSkinMeshRefined.cpp | 1 - .../stk_mesh/ngp/NgpFieldTestUtils.hpp | 1 + .../stk_mesh/ngp/NgpMeshTest.cpp | 41 ++ .../stk_mesh/ngp/NgpParallelSumTest.cpp | 2 +- .../stk_mesh/ngp/NgpUnitTestUtils.hpp | 66 ++- .../stk_mesh/ngp/TestNgpMeshUpdate.cpp | 27 +- .../stk_mesh/ngp/UnitTestNgp.cpp | 10 + .../ngp/UnitTestNgpMeshModification.cpp | 444 ++++++++++++++++++ 
.../stk_mesh/ngp/ngpFieldTest.cpp | 34 +- .../stk_mesh/ngp/ngpMultiStateFieldTests.cpp | 20 +- .../parallel/UnitTestDeviceAwareMPI.cpp | 129 ++++- .../stk_util/util/UnitTestFPExceptions.cpp | 108 +++++ packages/stk/stk_util/stk_util/Version.hpp | 2 +- .../stk/stk_util/stk_util/ngp/NgpSpaces.hpp | 43 +- .../stk_util/parallel/DeviceAwareMPI.cpp | 12 +- .../stk_util/parallel/OutputStreams.cpp | 4 +- .../stk_util/registry/ProductRegistry.cpp | 2 +- packages/stk/stk_util/stk_util/stk_config.h | 2 + .../stk_util/stk_util/util/FPExceptions.cpp | 51 ++ .../stk_util/stk_util/util/FPExceptions.hpp | 95 ++++ 73 files changed, 2097 insertions(+), 550 deletions(-) create mode 100644 packages/stk/cmake/fperrno/fperrno_test.cpp create mode 100644 packages/stk/cmake/fpexcept/fpexcept_test.cpp create mode 100644 packages/stk/stk_performance_tests/stk_util/perfPrintTimersTable.cpp create mode 100644 packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpMeshModification.cpp create mode 100644 packages/stk/stk_unit_tests/stk_util/util/UnitTestFPExceptions.cpp create mode 100644 packages/stk/stk_util/stk_util/util/FPExceptions.cpp create mode 100644 packages/stk/stk_util/stk_util/util/FPExceptions.hpp diff --git a/packages/stk/CHANGELOG.md b/packages/stk/CHANGELOG.md index 6160baab2c76..514f7e831a1a 100644 --- a/packages/stk/CHANGELOG.md +++ b/packages/stk/CHANGELOG.md @@ -1,5 +1,8 @@ # CHANGELOG +5.21.6-1 (STK_VERSION 5210601) 10/31/2024 + stk_mesh, stk_search: more fixes for HIP unified and Cuda no-uvm builds + 5.21.6 (STK_VERSION 5210600) 10/25/2024 stk_search: fix build-error (instantiation error for morton_lbvh_search) for gcc 13.2 stk_util: added parallel/OutputStreams.hpp diff --git a/packages/stk/CMakeLists.txt b/packages/stk/CMakeLists.txt index f9ff77e06c1c..fa8d3c733a63 100644 --- a/packages/stk/CMakeLists.txt +++ b/packages/stk/CMakeLists.txt @@ -32,9 +32,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23 FATAL_ERROR) -message("starting STK cmake configuration, CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR}") +message("Starting STK cmake configuration, CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR}") option(STK_BUILT_FOR_SIERRA "Enable SIERRA capability" OFF) set(SIERRA_MIGRATION ${STK_BUILT_FOR_SIERRA} CACHE BOOL "Enable SIERRA capability") @@ -46,14 +46,14 @@ endif() IF(COMMAND TRIBITS_PACKAGE_DECL) SET(HAVE_STK_Trilinos ON) TRIBITS_PACKAGE_DECL(STK) - MESSAGE("*** Building STK as a Trilinos package. ***") + message("*** Building STK as a Trilinos package. ***") ELSE() SET(HAVE_STK_Trilinos OFF) project(STK CXX Fortran) SET(PACKAGE_NAME "STK") - MESSAGE("*** Building STK as a stand-alone cmake package. ***") + message("*** Building STK as a stand-alone cmake package. ***") ENDIF() SET(STK_TOPLEVEL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -87,9 +87,9 @@ endif() STK_ADD_DEBUG_AND_DEPRECATED_OPTIONS() -MESSAGE("\nPROJECT_NAME: '${PROJECT_NAME}'") -MESSAGE("PACKAGE_NAME: '${PACKAGE_NAME}'") -MESSAGE("${PACKAGE_NAME}_SOURCE_DIR: '${${PACKAGE_NAME}_SOURCE_DIR}'\n") +message("\nPROJECT_NAME: '${PROJECT_NAME}'") +message("PACKAGE_NAME: '${PACKAGE_NAME}'") +message("${PACKAGE_NAME}_SOURCE_DIR: '${${PACKAGE_NAME}_SOURCE_DIR}'\n") IF (HAVE_STK_Trilinos) SET(STK_HAVE_KOKKOS ON) @@ -110,12 +110,12 @@ ELSE() IF(DEFINED STK_ENABLE_MPI) IF(STK_ENABLE_MPI) - MESSAGE("MPI requested via STK_ENABLE_MPI=ON") + message("MPI requested via STK_ENABLE_MPI=ON") ELSE() - MESSAGE("MPI disabled via STK_ENABLE_MPI=OFF") + message("MPI disabled via STK_ENABLE_MPI=OFF") ENDIF() ELSE() - MESSAGE("MPI defaulting to off. (STK_ENABLE_MPI not defined)") + message("MPI defaulting to off. 
(STK_ENABLE_MPI not defined)") ENDIF() IF(STK_ENABLE_MPI) @@ -125,36 +125,51 @@ ELSE() include_directories(SYSTEM ${MPI_INCLUDE_PATH}) message("MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") ELSE() - MESSAGE(FATAL_ERROR "MPI enabled by '-DSTK_ENABLE_MPI' but not found.") + message(FATAL_ERROR "MPI enabled by '-DSTK_ENABLE_MPI' but not found.") ENDIF() ELSE() - MESSAGE("Building serial without MPI. (To enable MPI, use '-DSTK_ENABLE_MPI:BOOL=ON')") + message("Building serial without MPI. (To enable MPI, use '-DSTK_ENABLE_MPI:BOOL=ON')") ENDIF() ENDIF() -find_package(ArborX QUIET) -if(TARGET ArborX::ArborX) - MESSAGE("Found ArborX, making it available within stk") - SET(STK_HAS_ARBORX ON) +IF (STK_ENABLE_ARBORX OR (NOT HAVE_STK_Trilinos)) + if (HAVE_STK_Trilinos) + message("Caution: when building in Trilinos with ArborX enabled, Kokkos versions need to be consistent.") + endif() + find_package(ArborX QUIET) + if(TARGET ArborX::ArborX) + message("Found ArborX, making it available for stk search") + SET(STK_HAS_ARBORX ON) + else() + message("Optional search library ArborX is not enabled.") + endif() else() - MESSAGE("Optional search library ArborX is not enabled.") + message("Optional search library ArborX is not enabled.") endif() +stk_check_fp_handling() + if(NOT HAVE_STK_Trilinos) - find_package(SEACAS) + find_package(SEACAS QUIET) endif() if(SEACAS_ENABLE_SEACASIoss) message("Enabling stk usage of SEACASIoss") SET(STK_HAS_SEACAS_IOSS ON) +else() + message("Optional SEACASIoss usage not enabled") endif() if(SEACAS_ENABLE_SEACASExodus) message("Enabling stk usage of SEACASExodus") SET(STK_HAS_SEACAS_EXODUS ON) +else() + message("Optional SEACASExodus usage not enabled") endif() if(SEACAS_ENABLE_SEACASNemesis) message("Enabling stk usage of SEACASNemesis") SET(STK_HAS_SEACAS_NEMESIS ON) +else() + message("Optional SEACASNemesis usage not enabled") endif() if (Trilinos_ENABLE_Intrepid2) @@ -206,11 +221,11 @@ ELSEIF("${FC_FN_UNDERSCORE}" STREQUAL "UNDER") 
ELSEIF("${FC_FN_UNDERSCORE}" STREQUAL "SECOND_UNDER") SET(FORTRAN_TWO_UNDERSCORES ON) ELSE() - MESSAGE("Could not determine the Fortran mangling; defaulting to one underscore.") + message("Could not determine the Fortran mangling; defaulting to one underscore.") SET(FORTRAN_ONE_UNDERSCORE ON) ENDIF() -MESSAGE("\nCMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") +message("\nCMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") STK_CONFIGURE_FILE(STK_Trilinos_config.h) diff --git a/packages/stk/cmake/STK_Trilinos_config.h.in b/packages/stk/cmake/STK_Trilinos_config.h.in index 40aea3c9cd45..dcd83299ed83 100644 --- a/packages/stk/cmake/STK_Trilinos_config.h.in +++ b/packages/stk/cmake/STK_Trilinos_config.h.in @@ -80,6 +80,9 @@ #cmakedefine FORTRAN_TWO_UNDERSCORES #endif +#cmakedefine STK_HAVE_FP_EXCEPT +#cmakedefine STK_HAVE_FP_ERRNO + #cmakedefine SIERRA_MIGRATION #cmakedefine STK_16BIT_UPWARDCONN_INDEX_TYPE diff --git a/packages/stk/cmake/fperrno/fperrno_test.cpp b/packages/stk/cmake/fperrno/fperrno_test.cpp new file mode 100644 index 000000000000..ea77ba40b8d3 --- /dev/null +++ b/packages/stk/cmake/fperrno/fperrno_test.cpp @@ -0,0 +1,23 @@ + +#include +#include +#include + +int main(int argc, char** argv) +{ + bool haveFpErrno = false; + if (math_errhandling & MATH_ERRNO) { + [[maybe_unused]] auto result = std::log(0.0); + if (errno == ERANGE) { + haveFpErrno = true; + std::cout<<"ON"; //no newline, this output will set a cmake variable + } + } + + if (!haveFpErrno) { + std::cout<<"OFF"; + } + + return 0; +} + diff --git a/packages/stk/cmake/fpexcept/fpexcept_test.cpp b/packages/stk/cmake/fpexcept/fpexcept_test.cpp new file mode 100644 index 000000000000..1532ceb60c25 --- /dev/null +++ b/packages/stk/cmake/fpexcept/fpexcept_test.cpp @@ -0,0 +1,23 @@ + +#include +#include +#include + +int main(int argc, char** argv) +{ + bool haveFpExcept = false; + if (math_errhandling & MATH_ERREXCEPT) { + [[maybe_unused]] auto result = std::log(0.0); + if (std::fetestexcept(FE_DIVBYZERO)) 
{ + haveFpExcept = true; + std::cout<<"ON"; //no newline, this output will set a cmake variable + } + } + + if (!haveFpExcept) { + std::cout<<"OFF"; + } + + return 0; +} + diff --git a/packages/stk/cmake/stk_wrappers.cmake b/packages/stk/cmake/stk_wrappers.cmake index 8a88fa9aba8c..f141f6f3c0f8 100644 --- a/packages/stk/cmake/stk_wrappers.cmake +++ b/packages/stk/cmake/stk_wrappers.cmake @@ -17,6 +17,35 @@ macro(STK_CONFIGURE_FILE filename) endif() endmacro() +function(stk_check_fp_handling) +# +# The following try_run commands use syntax that is supposed to work for +# cmake versions older than 3.25, as stated in cmake documentation +# here: https://cmake.org/cmake/help/latest/command/try_run.html +# As of Nov 8, 2024, trilinos and stk require cmake 3.23 +# + message("calling try_run with bindir=${CMAKE_CURRENT_BINARY_DIR}/fpexcept, srcfile=${${PACKAGE_NAME}_SOURCE_DIR}/cmake/fpexcept/fpexcept_test.cpp") + try_run(RUN_RESULT COMPILE_RESULT + ${CMAKE_CURRENT_BINARY_DIR}/fpexcept + ${${PACKAGE_NAME}_SOURCE_DIR}/cmake/fpexcept/fpexcept_test.cpp + RUN_OUTPUT_VARIABLE FP_RESULT) + + message("FP-EXCEPT-CHECK COMPILE_RESULT: ${COMPILE_RESULT}") + message("FP-EXCEPT-CHECK RUN_RESULT: ${RUN_RESULT}") + set(STK_HAVE_FP_EXCEPT ${FP_RESULT} CACHE BOOL "") + message("STK_HAVE_FP_EXCEPT: ${STK_HAVE_FP_EXCEPT}") + + try_run(RUN_RESULT COMPILE_RESULT + ${CMAKE_CURRENT_BINARY_DIR}/fperrno + ${${PACKAGE_NAME}_SOURCE_DIR}/cmake/fperrno/fperrno_test.cpp + RUN_OUTPUT_VARIABLE FP_RESULT) + + message("FP-ERRNO-CHECK COMPILE_RESULT: ${COMPILE_RESULT}") + message("FP-ERRNO-CHECK RUN_RESULT: ${RUN_RESULT}") + set(STK_HAVE_FP_ERRNO ${FP_RESULT} CACHE BOOL "") + message("STK_HAVE_FP_ERRNO: ${STK_HAVE_FP_ERRNO}") +endfunction() + function(stk_process_enables) message("******** Begin stk_process_enables ******") if(STK_ENABLE_ALL) diff --git a/packages/stk/stk_expreval/stk_expreval/Eval.cpp b/packages/stk/stk_expreval/stk_expreval/Eval.cpp index ae20873d80d0..21f5f0a61832 100644 --- 
a/packages/stk/stk_expreval/stk_expreval/Eval.cpp +++ b/packages/stk/stk_expreval/stk_expreval/Eval.cpp @@ -46,6 +46,7 @@ Eval::Eval(VariableMap::Resolver & resolver, const std::string & expression, Var m_expression(expression), m_syntaxStatus(false), m_parseStatus(false), + m_fpErrorBehavior(FPErrorBehavior::Warn), m_headNode(nullptr), m_arrayOffsetType(arrayOffsetType), m_parsedEval(nullptr) @@ -58,6 +59,7 @@ Eval::Eval(const std::string & expression, Variable::ArrayOffset arrayOffsetType m_expression(expression), m_syntaxStatus(false), m_parseStatus(false), + m_fpErrorBehavior(FPErrorBehavior::Warn), m_headNode(nullptr), m_arrayOffsetType(arrayOffsetType), m_parsedEval(nullptr) diff --git a/packages/stk/stk_expreval/stk_expreval/Eval.hpp b/packages/stk/stk_expreval/stk_expreval/Eval.hpp index 57882db67167..4935b0010a2c 100644 --- a/packages/stk/stk_expreval/stk_expreval/Eval.hpp +++ b/packages/stk/stk_expreval/stk_expreval/Eval.hpp @@ -57,6 +57,12 @@ class Eval public: typedef std::set UndefinedFunctionSet; + enum class FPErrorBehavior { + Ignore, + Warn, + Error + }; + Eval(VariableMap::Resolver &resolver = VariableMap::getDefaultResolver(), const std::string &expr = "", const Variable::ArrayOffset arrayOffsetType = Variable::ZERO_BASED_INDEX); @@ -113,6 +119,10 @@ class Eval UndefinedFunctionSet &getUndefinedFunctionSet() { return m_undefinedFunctionSet; } + void set_fp_error_behavior(FPErrorBehavior flag) { m_fpErrorBehavior = flag; } + + FPErrorBehavior get_fp_error_behavior() const { return m_fpErrorBehavior; } + bool getSyntaxStatus() const { return m_syntaxStatus; } bool getParseStatus() const { return m_parseStatus; } @@ -198,6 +208,7 @@ class Eval std::string m_expression; bool m_syntaxStatus; bool m_parseStatus; + FPErrorBehavior m_fpErrorBehavior; Node* m_headNode; std::vector> m_nodes; diff --git a/packages/stk/stk_expreval/stk_expreval/NgpNode.cpp b/packages/stk/stk_expreval/stk_expreval/NgpNode.cpp index b24c9dacf7be..6aca63f58286 100644 --- 
a/packages/stk/stk_expreval/stk_expreval/NgpNode.cpp +++ b/packages/stk/stk_expreval/stk_expreval/NgpNode.cpp @@ -34,6 +34,7 @@ #include "stk_expreval/NgpNode.hpp" #include "stk_expreval/Function.hpp" +#include "stk_expreval/Eval.hpp" namespace stk { namespace expreval { @@ -47,7 +48,8 @@ NgpNode::NgpNode(const Node& node) m_ternaryFalseNextNodeIndex(node.m_ternaryFalseNextNodeIndex), m_leftNodeIndex((node.m_left != nullptr) ? node.m_left->m_currentNodeIndex : -1), m_rightNodeIndex((node.m_right != nullptr) ? node.m_right->m_currentNodeIndex : -1), - m_ternaryOtherNodeIndex((node.m_ternaryOther != nullptr) ? node.m_ternaryOther->m_currentNodeIndex : -1) + m_ternaryOtherNodeIndex((node.m_ternaryOther != nullptr) ? node.m_ternaryOther->m_currentNodeIndex : -1), + m_fpErrorBehavior(node.m_owner->get_fp_error_behavior()) { if (m_opcode == OPCODE_CONSTANT) { m_data.constant.value = node.m_data.constant.value; diff --git a/packages/stk/stk_expreval/stk_expreval/NgpNode.hpp b/packages/stk/stk_expreval/stk_expreval/NgpNode.hpp index d57c16746808..bde87345c68a 100644 --- a/packages/stk/stk_expreval/stk_expreval/NgpNode.hpp +++ b/packages/stk/stk_expreval/stk_expreval/NgpNode.hpp @@ -36,10 +36,12 @@ #define NGPNODE_HPP #include "Kokkos_Core.hpp" +#include "stk_expreval/NgpNode.hpp" #include "stk_util/ngp/NgpSpaces.hpp" #include "stk_expreval/Variable.hpp" #include "stk_expreval/Node.hpp" #include "stk_expreval/Function.hpp" +#include "stk_expreval/Eval.hpp" namespace stk { namespace expreval { @@ -64,7 +66,8 @@ class NgpNode m_ternaryFalseNextNodeIndex(-1), m_leftNodeIndex(-1), m_rightNodeIndex(-1), - m_ternaryOtherNodeIndex(-1) + m_ternaryOtherNodeIndex(-1), + m_fpErrorBehavior(Eval::FPErrorBehavior::Error) {} explicit NgpNode(const Node& node); @@ -78,6 +81,17 @@ class NgpNode KOKKOS_DEFAULTED_FUNCTION ~NgpNode() = default; +#define checkNgpNodeFPError(val, name) \ + { \ + do { \ + if (m_fpErrorBehavior == stk::expreval::Eval::FPErrorBehavior::Warn && !std::isfinite(val)) 
{ \ + printf("error in expression evaluator function " name ": " __FILE__ ": " LINE_STRING); \ + } else if (m_fpErrorBehavior == stk::expreval::Eval::FPErrorBehavior::Error && !std::isfinite(val)) { \ + STK_NGP_ThrowErrorMsg("error in expression evaluator function " name); \ + } \ + } while (false); \ + } + template KOKKOS_FUNCTION double @@ -106,11 +120,15 @@ class NgpNode break; } case OPCODE_EXPONENIATION: { - setResult(resultBuffer) = std::pow(get_left_node()->getResult(resultBuffer),get_right_node()->getResult(resultBuffer)); + double val = std::pow(get_left_node()->getResult(resultBuffer),get_right_node()->getResult(resultBuffer)); + checkNgpNodeFPError(val, "std::pow"); + setResult(resultBuffer) = val; break; } case OPCODE_DIVIDE: { - setResult(resultBuffer) = get_left_node()->getResult(resultBuffer)/get_right_node()->getResult(resultBuffer); + double val = get_left_node()->getResult(resultBuffer)/get_right_node()->getResult(resultBuffer); + checkNgpNodeFPError(val, "division operator"); + setResult(resultBuffer) = val; break; } case OPCODE_MODULUS: { @@ -275,6 +293,7 @@ class NgpNode int m_leftNodeIndex; int m_rightNodeIndex; int m_ternaryOtherNodeIndex; + Eval::FPErrorBehavior m_fpErrorBehavior; KOKKOS_FUNCTION double evaluate_function(int argumentCount, double* arguments) const @@ -357,14 +376,18 @@ class NgpNode } case FunctionType::POW : { if (argumentCount == 2) { - return std::pow(arguments[0], arguments[1]); + double val = std::pow(arguments[0], arguments[1]); + checkNgpNodeFPError(val, "pow"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for pow function"); break; } case FunctionType::SQRT : { if (argumentCount == 1) { - return std::sqrt(arguments[0]); + double val = std::sqrt(arguments[0]); + checkNgpNodeFPError(val, "sqrt"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for sqrt function"); break; @@ -378,14 +401,18 @@ class NgpNode } case FunctionType::LN : { if (argumentCount == 1) { - return 
std::log(arguments[0]); + double val = std::log(arguments[0]); + checkNgpNodeFPError(val, "ln"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for ln or log function"); break; } case FunctionType::LOG10 : { if (argumentCount == 1) { - return std::log10(arguments[0]); + double val = std::log10(arguments[0]); + checkNgpNodeFPError(val, "log10"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for log10 function"); break; @@ -427,14 +454,18 @@ class NgpNode } case FunctionType::ASIN : { if (argumentCount == 1) { - return std::asin(arguments[0]); + double val = std::asin(arguments[0]); + checkNgpNodeFPError(val, "asin"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for asin function"); break; } case FunctionType::ACOS : { if (argumentCount == 1) { - return std::acos(arguments[0]); + double val = std::acos(arguments[0]); + checkNgpNodeFPError(val, "acos"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for acos function"); break; @@ -476,35 +507,45 @@ class NgpNode } case FunctionType::ASINH : { if (argumentCount == 1) { - return std::asinh(arguments[0]); + double val = std::asinh(arguments[0]); + checkNgpNodeFPError(val, "asinh"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for asinh function"); break; } case FunctionType::ACOSH : { if (argumentCount == 1) { - return std::acosh(arguments[0]); + double val = std::acosh(arguments[0]); + checkNgpNodeFPError(val, "acosh"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for acosh function"); break; } case FunctionType::ATANH : { if (argumentCount == 1) { - return std::atanh(arguments[0]); + double val = std::atanh(arguments[0]); + checkNgpNodeFPError(val , "atanh"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for atanh function"); break; } case FunctionType::ERF : { if (argumentCount == 1) { - return std::erf(arguments[0]); + double val = std::erf(arguments[0]); + 
checkNgpNodeFPError(val, "erf"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for erf function"); break; } case FunctionType::ERFC : { if (argumentCount == 1) { - return std::erfc(arguments[0]); + double val = std::erfc(arguments[0]); + checkNgpNodeFPError(val, "erfc"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for erfc function"); break; @@ -546,62 +587,82 @@ class NgpNode } case FunctionType::CYCLOIDAL_RAMP : { if (argumentCount == 3) { - return cycloidal_ramp(arguments[0], arguments[1], arguments[2]); + double val = cycloidal_ramp(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "cycloidal_ramp"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for cycloidal_ramp function"); break; } case FunctionType::COS_RAMP : { if (argumentCount == 1) { - return cosine_ramp1(arguments[0]); + double val = cosine_ramp1(arguments[0]); + checkNgpNodeFPError(val, "cosine_ramp1"); + return val; } else if (argumentCount == 2) { - return cosine_ramp2(arguments[0], arguments[1]); + double val = cosine_ramp2(arguments[0], arguments[1]); + checkNgpNodeFPError(val, "cosine_ramp2"); + return val; } else if (argumentCount == 3) { - return cosine_ramp3(arguments[0], arguments[1], arguments[2]); + double val = cosine_ramp3(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "cosine_ramp3"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for cos_ramp or cosine_ramp function"); break; } case FunctionType::LINEAR_RAMP : { if (argumentCount == 3) { - return linear_ramp3(arguments[0], arguments[1], arguments[2]); + double val = linear_ramp3(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "linear_ramp3"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for linear_ramp function"); break; } case FunctionType::HAVERSINE_PULSE : { if (argumentCount == 3) { - return haversine_pulse(arguments[0], arguments[1], arguments[2]); + 
double val = haversine_pulse(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "haversine_pulse"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for haversine_pulse function"); break; } case FunctionType::POINT2D : { if (argumentCount == 4) { - return point_2(arguments[0], arguments[1], arguments[2], arguments[3]); + double val = point_2(arguments[0], arguments[1], arguments[2], arguments[3]); + checkNgpNodeFPError(val, "point_2d"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for pulse_2 function"); break; } case FunctionType::POINT3D : { if (argumentCount == 5) { - return point_3(arguments[0], arguments[1], arguments[2], arguments[3], arguments[4]); + double val = point_3(arguments[0], arguments[1], arguments[2], arguments[3], arguments[4]); + checkNgpNodeFPError(val, "point_3d"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for pulse_3 function"); break; } case FunctionType::EXPONENTIAL_PDF : { if (argumentCount == 2) { - return exponential_pdf(arguments[0], arguments[1]); + double val = exponential_pdf(arguments[0], arguments[1]); + checkNgpNodeFPError(val, "exponential_pdf"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for exponential_pdf function"); break; } case FunctionType::LOG_UNIFORM_PDF : { if (argumentCount == 3) { - return log_uniform_pdf(arguments[0], arguments[1], arguments[2]); + double val = log_uniform_pdf(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "log_uniform_pdf"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for log_uniform_pdf function"); break; @@ -615,14 +676,18 @@ class NgpNode } case FunctionType::WEIBULL_PDF : { if (argumentCount == 3) { - return weibull_pdf(arguments[0], arguments[1], arguments[2]); + double val = weibull_pdf(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "weibull_pdf"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of 
arguments for weibull_pdf function"); break; } case FunctionType::GAMMA_PDF : { if (argumentCount == 3) { - return gamma_pdf(arguments[0], arguments[1], arguments[2]); + double val = gamma_pdf(arguments[0], arguments[1], arguments[2]); + checkNgpNodeFPError(val, "gamma_pdf"); + return val; } STK_NGP_ThrowErrorMsg("Incorrect number of arguments for gamma_pdf function"); break; @@ -655,6 +720,8 @@ class NgpNode } }; + + } } diff --git a/packages/stk/stk_expreval/stk_expreval/Node.cpp b/packages/stk/stk_expreval/stk_expreval/Node.cpp index 8d73bc036a19..83916c8c7c9c 100644 --- a/packages/stk/stk_expreval/stk_expreval/Node.cpp +++ b/packages/stk/stk_expreval/stk_expreval/Node.cpp @@ -35,6 +35,7 @@ #include "stk_expreval/Node.hpp" #include "stk_expreval/Eval.hpp" #include "stk_expreval/Constants.hpp" +#include "stk_util/util/FPExceptions.hpp" #include namespace stk { @@ -81,6 +82,7 @@ double& Node::setResult() { void Node::eval() { + stk::util::clear_fp_errors(); switch (m_opcode) { case OPCODE_STATEMENT: { setResult() = m_left->getResult(); @@ -105,6 +107,7 @@ Node::eval() } case OPCODE_EXPONENIATION: { setResult() = std::pow(m_left->getResult(),m_right->getResult()); + checkFPError("std::pow"); break; } case OPCODE_DIVIDE: { @@ -205,12 +208,15 @@ Node::eval() } setResult() = (*m_data.function.function)(argc, argv); + checkFPError(m_data.function.functionName); + break; } default: { STK_ThrowErrorMsg("Unknown OpCode (" + std::to_string(m_opcode) + ")"); } } + checkFPError(); m_hasBeenEvaluated = true; } @@ -429,5 +435,21 @@ Node::evalTrace(const NodeWeightMap & nodeWeights, EvalNodesType & evaluationNod } } +void Node::checkFPError(const char* fname) +{ + Eval::FPErrorBehavior behavior = m_owner->get_fp_error_behavior(); + if (behavior == Eval::FPErrorBehavior::Ignore) + { + return; + } else if (behavior == Eval::FPErrorBehavior::Warn) + { + stk::util::warn_on_fp_error(fname); + } else if (behavior == Eval::FPErrorBehavior::Error) + { + 
stk::util::throw_on_fp_error(fname); + } +} + + } } diff --git a/packages/stk/stk_expreval/stk_expreval/Node.hpp b/packages/stk/stk_expreval/stk_expreval/Node.hpp index 0c93c9f89d0f..d2337092350e 100644 --- a/packages/stk/stk_expreval/stk_expreval/Node.hpp +++ b/packages/stk/stk_expreval/stk_expreval/Node.hpp @@ -171,6 +171,8 @@ class Node double& setResult(); + void checkFPError(const char* fname = nullptr); + const Opcode m_opcode; union _data diff --git a/packages/stk/stk_expreval/stk_expreval/Parser.cpp b/packages/stk/stk_expreval/stk_expreval/Parser.cpp index 9f454b35eb5c..ac2ad798beed 100644 --- a/packages/stk/stk_expreval/stk_expreval/Parser.cpp +++ b/packages/stk/stk_expreval/stk_expreval/Parser.cpp @@ -571,7 +571,10 @@ parseFunction(Eval & eval, function->m_data.function.functionType = functionType; std::strncpy(function->m_data.function.functionName, function_name.c_str(), - function_name.length() < Node::MAXIMUM_FUNCTION_NAME_LENGTH-1 ? function_name.length() : Node::MAXIMUM_FUNCTION_NAME_LENGTH-1); + std::min(function_name.length()+1, std::string::size_type(Node::MAXIMUM_FUNCTION_NAME_LENGTH)) + ); + function->m_data.function.functionName[Node::MAXIMUM_FUNCTION_NAME_LENGTH-1] = '\0'; + break; } } diff --git a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_standalone_serial b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_standalone_serial index 30ac86d21f52..4ed2a5dcc355 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_standalone_serial +++ b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_standalone_serial @@ -34,6 +34,7 @@ cmake \ -DCMAKE_CXX_COMPILER=${OMPI_CXX} \ -DCMAKE_CXX_FLAGS:STRING="${cmake_cxx_flags}" \ -DSTK_ENABLE_ALL=ON \ +-DSTK_ENABLE_ARBORX=ON \ -DSTK_ENABLE_MPI:BOOL=OFF \ -DSTK_ENABLE_STKMiddle_mesh=OFF \ -DSTK_ENABLE_STKMiddle_mesh_util=OFF \ diff --git a/packages/stk/stk_integration_tests/cmake_install_test/spack.cuda.yaml 
b/packages/stk/stk_integration_tests/cmake_install_test/spack.cuda.yaml index f0c30bcf6301..1622329941f5 100644 --- a/packages/stk/stk_integration_tests/cmake_install_test/spack.cuda.yaml +++ b/packages/stk/stk_integration_tests/cmake_install_test/spack.cuda.yaml @@ -9,13 +9,15 @@ spack: - zlib - openmpi@4.1.6 - kokkos+cuda+cuda_constexpr+cuda_lambda+cuda_relocatable_device_code~cuda_uvm~shared+wrapper cuda_arch=70 - - trilinos@develop~boost+cuda+cuda_rdc+exodus+kokkos~shared+stk+test~uvm+wrapper cuda_arch=70 cxxstd=17 + - trilinos@master~boost+cuda+cuda_rdc+exodus+kokkos~shared~uvm+wrapper cuda_arch=70 cxxstd=17 view: true concretizer: unify: true config: install_tree: root: SED_REPLACE_INSTALL_PATH + build_stage: + - /fgs/$USER/spack-stage compilers: - compiler: spec: gcc@=10.3.0 @@ -43,10 +45,6 @@ spack: modules: [] environment: {} extra_rpaths: [] - develop: - trilinos: - path: SED_REPLACE_TRILINOS_PATH - spec: trilinos@=develop packages: binutils: externals: diff --git a/packages/stk/stk_integration_tests/cmake_install_test/stk_spack_build_test_cuda.sh b/packages/stk/stk_integration_tests/cmake_install_test/stk_spack_build_test_cuda.sh index d1dc0f318ff8..633d07cb0545 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/stk_spack_build_test_cuda.sh +++ b/packages/stk/stk_integration_tests/cmake_install_test/stk_spack_build_test_cuda.sh @@ -12,40 +12,30 @@ exe() { # To specify custom paths for one or more of the following, set # the variable on the command line when running this script. 
# Example: -# $ TRILINOS=/my/path/trilinos source stk_spack_create_env_cuda.sh +# $ SIERRA=/my/path/code source stk_spack_create_env_cuda.sh work_dir=${STK_SPACK_WORK_DIR:-/fgs/$USER/stk-spack-testing-cuda} -trilinos_source=${TRILINOS:-/fgs/$USER/Trilinos} sierra_source=${SIERRA:-/fgs/$USER/code} -stk_spack_env=CUDA STK_SPACK_WORK_DIR=${work_dir} -TRILINOS=${trilinos_source} SIERRA=${sierra_source} printf "using STK_SPACK_WORK_DIR=${STK_SPACK_WORK_DIR}\n"; -printf "using TRILINOS=${TRILINOS}\n"; printf "using SIERRA=${SIERRA}\n"; -if [ ! -d ${trilinos_source} ] ; then - printf "ERROR, TRILINOS location not specified or not a directory.\n"; - return 1; -fi - if [ ! -d ${sierra_source} ] ; then printf "ERROR, SIERRA location not specified or not a directory.\n"; return 1; fi -printf "copying stk directory from SIERRA to TRILINOS...\n"; -exe rm -rf ${trilinos_source}/packages/stk -exe cp -r ${sierra_source}/stk ${trilinos_source}/packages - printf "Setting up spack env 'stkSpackTesting' in STK_SPACK_WORK_DIR=${work_dir}\n" exe mkdir -p ${work_dir} exe cd ${work_dir} -exe rm -rf spack spack.yaml stk_test_app +exe rm -rf spack spack.yaml + +exe mkdir -p ${work_dir}/tmp +exe export TMPDIR=${work_dir}/tmp exe module load aue/python/3.11.6 exe module load aue/git/2.42.0 @@ -66,7 +56,6 @@ exe cp ${sierra_source}/stk/stk_integration_tests/cmake_install_test/spack.cuda. 
spack_yaml_file=${work_dir}/spack.yaml exe sed -i s@SED_REPLACE_INSTALL_PATH@"${work_dir}/install"@g ${spack_yaml_file} -exe sed -i s@SED_REPLACE_TRILINOS_PATH@"${trilinos_source}"@g ${spack_yaml_file} exe spack config add -f ${spack_yaml_file} exe spack env activate stkSpackTesting @@ -79,12 +68,9 @@ exe spack add zlib exe spack add ncurses@6.3 exe spack add openmpi@4.1.6 exe spack add cuda@11.4.4 -exe spack add kokkos+cuda+wrapper+cuda_constexpr+cuda_lambda+cuda_relocatable_device_code~shared cuda_arch=70 -exe spack add trilinos@develop+cuda+cuda_rdc+exodus+stk+kokkos+wrapper~amesos~epetra~shared~boost cuda_arch=70 cxxstd=17 - -# don't need the following 'spack develop' command since we have specified it in -# our pre-packaged spack.cuda.yaml file. -# exe spack develop trilinos@develop -p ${trilinos_source} +exe spack add googletest cxxstd=17 +exe spack add kokkos+cuda~cuda_uvm+wrapper+cuda_constexpr+cuda_lambda+cuda_relocatable_device_code~shared cuda_arch=70 +exe spack add trilinos@master+cuda+cuda_rdc~uvm+exodus+kokkos+shards+intrepid2+zoltan2+wrapper~amesos~epetra~shared~boost cuda_arch=70 cxxstd=17 exe spack concretize -f if [ $? -ne 0 ] ; then @@ -98,18 +84,23 @@ if [ $? -ne 0 ] ; then return 1; fi +exe spack load googletest exe spack load cmake exe spack load openmpi printf "setting OMPI_CXX for CUDA environment\n"; export OMPI_CXX=$(find $(spack location -i kokkos) -name nvcc_wrapper) -printf "copying stk test app from SIERRA...\n"; -exe cp -r ${sierra_source}/stk/stk_integration_tests/cmake_install_test/stk_test_app . +printf "making build-dir for stk build...\n"; +stk_build_dir=${work_dir}/build_stk +exe mkdir -p ${stk_build_dir} -exe cd stk_test_app +exe cp ${sierra_source}/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_in_spack_env ${stk_build_dir} + +exe cd ${stk_build_dir} + +exe STK_SOURCE_DIR=${sierra_source}/stk source run_cmake_in_spack_env -exe source run_cmake_in_spack_env if [ $? -ne 0 ] ; then printf "!! 
error running cmake\n"; return 1; @@ -121,12 +112,6 @@ if [ $? -ne 0 ] ; then return 1; fi -exe mpirun --np 4 ./test_stk_app -if [ $? -ne 0 ] ; then - printf "!! error running test_stk_app\n"; - return 1; -fi - -printf "all done, SUCCESS!\n"; +printf "all done, SUCCESS building!\n"; return 0 diff --git a/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_in_spack_env b/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_in_spack_env index c42f7adf585a..8d6826d19df9 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_in_spack_env +++ b/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_in_spack_env @@ -2,14 +2,21 @@ spack env status spack find -v trilinos -TEST_STK_APP_SOURCE_DIR=$(pwd) +if [ -z ${STK_SOURCE_DIR+x} ]; then + echo "STK_SOURCE_DIR is unset"; + return 1; +else + echo "STK_SOURCE_DIR is set to '$STK_SOURCE_DIR'"; +fi -mkdir -p build - -cd build +stk_source_dir=${STK_SOURCE_DIR} cmake \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-RELEASE} \ -DCMAKE_CXX_COMPILER=mpicxx \ -${TEST_STK_APP_SOURCE_DIR} +-DSTK_ENABLE_ALL:BOOL=ON \ +-DSTK_ENABLE_MPI:BOOL=ON \ +-DSTK_ENABLE_TESTS:BOOL=ON \ +-DSTK_ENABLE_STKMiddle_mesh:BOOL=OFF \ +${stk_source_dir} diff --git a/packages/stk/stk_io/stk_io/IossBridge.cpp b/packages/stk/stk_io/stk_io/IossBridge.cpp index e645230b9b33..a69dc2a801f3 100644 --- a/packages/stk/stk_io/stk_io/IossBridge.cpp +++ b/packages/stk/stk_io/stk_io/IossBridge.cpp @@ -187,20 +187,9 @@ void STKIORequire(bool cond) namespace { - static const std::string invalid("invalid"); static const std::string scalar("scalar"); static const std::string vector_2d("vector_2d"); static const std::string vector_3d("vector_3d"); - static const std::string full_tensor_36("full_tensor_36"); - static const std::string full_tensor_32("full_tensor_32"); - static const std::string full_tensor_22("full_tensor_22"); - static const std::string 
full_tensor_16("full_tensor_16"); - static const std::string full_tensor_12("full_tensor_12"); - static const std::string sym_tensor_33("sym_tensor_33"); - static const std::string sym_tensor_31("sym_tensor_31"); - static const std::string sym_tensor_21("sym_tensor_21"); - static const std::string matrix_22("matrix_22"); - static const std::string matrix_33("matrix_33"); const std::string base_stk_part_name = "_base_stk_part_name"; @@ -2211,8 +2200,6 @@ const stk::mesh::FieldBase *declare_stk_field_internal(stk::mesh::MetaData &meta stk::mesh::EntityRank rank = get_output_rank(params); //-------------------------------- // Create the special universal node block: - mesh::Selector sharedSelector = params.has_shared_selector() ? *(params.get_shared_selector()) - : meta.globally_shared_part(); mesh::Selector allSelector = meta.globally_shared_part() | meta.locally_owned_part(); if (params.get_subset_selector( )) allSelector &= *params.get_subset_selector(); @@ -2255,9 +2242,6 @@ const stk::mesh::FieldBase *declare_stk_field_internal(stk::mesh::MetaData &meta mesh::MetaData & meta = mesh::MetaData::get(part); Ioss::Region & ioRegion = params.io_region(); - mesh::Selector sharedSelector = params.has_shared_selector() ? 
*(params.get_shared_selector()) - : meta.globally_shared_part(); - mesh::Selector allSelector = (meta.globally_shared_part() | meta.locally_owned_part()) & part; if (params.get_subset_selector( )) allSelector &= *params.get_subset_selector(); if (params.get_output_selector(rank)) allSelector &= *params.get_output_selector(rank); diff --git a/packages/stk/stk_mesh/stk_mesh/base/Bucket.cpp b/packages/stk/stk_mesh/stk_mesh/base/Bucket.cpp index 9343a4992358..4eb2e69024ce 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/Bucket.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/Bucket.cpp @@ -95,7 +95,7 @@ struct ClearEntityFunctor {} template - void operator()(Bucket&, ConnectivityType& connectivity) + void operator()(Bucket&, ConnectivityType& /*connectivity*/) {} void operator()(Bucket&, impl::BucketConnDynamic& connectivity) @@ -144,7 +144,7 @@ struct DeclareRelationFunctor {} template - void operator()(Bucket& bucket, Connectivity& connectivity) + void operator()(Bucket& /*bucket*/, Connectivity& connectivity) { STK_ThrowAssert(!m_modified); m_modified = connectivity.add_connectivity(m_bucket_ordinal, m_to, m_ordinal, m_permutation); @@ -167,7 +167,7 @@ struct DestroyRelationFunctor {} template - void operator()(Bucket& bucket, Connectivity& connectivity) + void operator()(Bucket& /*bucket*/, Connectivity& connectivity) { STK_ThrowAssert(!m_modified); m_modified = connectivity.remove_connectivity(m_bucket_ordinal, m_to, m_ordinal); @@ -195,7 +195,7 @@ struct ReplaceRelationFunctor {} template - void operator()(Bucket& bucket, Connectivity& connectivity) + void operator()(Bucket& /*bucket*/, Connectivity& connectivity) { STK_ThrowAssert(!m_modified); m_modified = connectivity.replace_connectivity(m_bucket_ordinal, m_numConnectivity, @@ -249,7 +249,7 @@ bool raw_part_equal( const unsigned * lhs , const unsigned * rhs ) { bool result = true ; { - const unsigned * const end_lhs = lhs + *lhs ; + const unsigned * const end_lhs = lhs + *lhs + 1 ; while ( result && end_lhs != 
lhs ) { result = *lhs == *rhs ; ++lhs ; ++rhs ; @@ -287,7 +287,7 @@ Bucket::Bucket(BulkData & mesh, m_entity_rank(entityRank), m_topology(), m_key(key), - m_partOrdsBeginEnd(m_key.data()+1,m_key.data()+m_key[0]), + m_partOrdsBeginEnd(m_key.data()+1,m_key.data()+1+m_key[0]), m_capacity(initialCapacity), m_maxCapacity(maximumCapacity), m_size(0), @@ -323,7 +323,7 @@ Bucket::Bucket(BulkData & mesh, setup_connectivity(m_topology, entityRank, stk::topology::FACE_RANK, m_face_kind, m_fixed_face_connectivity); setup_connectivity(m_topology, entityRank, stk::topology::ELEMENT_RANK, m_element_kind, m_fixed_element_connectivity); - m_parts.reserve(m_key.size()); + m_parts.reserve(m_key.size()-1); supersets(m_parts); m_mesh.new_bucket_callback(m_entity_rank, m_parts, m_capacity, this); @@ -655,17 +655,15 @@ unsigned Bucket::get_ngp_field_bucket_is_modified(unsigned fieldOrdinal) const void Bucket::reset_part_ord_begin_end() { m_partOrdsBeginEnd.first = m_key.data()+1; - m_partOrdsBeginEnd.second = m_key.data()+m_key[0]; + m_partOrdsBeginEnd.second = m_key.data()+1+m_key[0]; } void Bucket::reset_bucket_key(const OrdinalVector& newPartOrdinals) { - unsigned partitionCount = m_key[m_key.size() - 1]; unsigned newPartCount = newPartOrdinals.size(); - m_key.resize(newPartCount + 2); - m_key[0] = newPartCount + 1; - m_key[newPartCount+1] = partitionCount; + m_key.resize(newPartCount + 1); + m_key[0] = newPartCount; for(unsigned i = 0; i < newPartCount; i++) { m_key[i+1] = newPartOrdinals[i]; diff --git a/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp b/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp index 9870058c6a6d..5a1655330001 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp @@ -62,15 +62,11 @@ #include "stk_mesh/baseImpl/ConnectEdgesImpl.hpp" #include "stk_mesh/baseImpl/Partition.hpp" #include "stk_topology/topology.hpp" // for topology, etc -#include "stk_util/diag/StringUtil.hpp" #include 
"stk_util/parallel/Parallel.hpp" // for ParallelMachine, etc -#include "stk_util/util/NamedPair.hpp" #include "stk_util/util/PairIter.hpp" // for PairIter -#include "stk_util/util/SameType.hpp" // for SameType, etc #include "stk_util/util/SortAndUnique.hpp" #include "stk_util/util/GetEnv.hpp" #include // for sort, lower_bound, unique, etc -#include #include // for operator<<, basic_ostream, etc #include // for back_insert_iterator, etc #include // for set, set<>::iterator, etc @@ -439,20 +435,6 @@ BulkData::register_device_mesh() const m_isDeviceMeshRegistered = true; } -void -BulkData::unregister_device_mesh() const -{ - m_isDeviceMeshRegistered = false; - - const stk::mesh::EntityRank endRank = static_cast(mesh_meta_data().entity_rank_count()); - for (stk::mesh::EntityRank rank = stk::topology::NODE_RANK; rank < endRank; ++rank) { - const stk::mesh::BucketVector & stkBuckets = buckets(rank); - for (Bucket * bucket : stkBuckets) { - bucket->set_ngp_bucket_id(INVALID_BUCKET_ID); - } - } -} - void BulkData::set_automatic_aura_option(AutomaticAuraOption auraOption, bool applyImmediately) { STK_ThrowRequireMsg(in_synchronized_state(),"set_automatic_aura_option currently can only be used when the mesh is not already being modified."); @@ -1026,17 +1008,15 @@ void BulkData::internal_verify_and_change_entity_parts( const EntityVector& enti OrdinalVector removePartsAndSubsetsMinusPartsInAddPartsList; OrdinalVector scratchOrdinalVec, scratchSpace; - for(Entity entity : entities) { - addPartsAndSupersets.clear(); - impl::fill_add_parts_and_supersets(add_parts, addPartsAndSupersets); + impl::fill_add_parts_and_supersets(add_parts, addPartsAndSupersets); + for(Entity entity : entities) { impl::fill_remove_parts_and_subsets_minus_parts_in_add_parts_list(remove_parts, addPartsAndSupersets, bucket_ptr(entity), removePartsAndSubsetsMinusPartsInAddPartsList); - internal_change_entity_parts(entity, - addPartsAndSupersets, + internal_change_entity_parts(entity, addPartsAndSupersets, 
removePartsAndSubsetsMinusPartsInAddPartsList, scratchOrdinalVec, scratchSpace); } @@ -1739,6 +1719,7 @@ void BulkData::comm_shared_procs(Entity entity, std::vector & procs ) const void BulkData::shared_procs_intersection(const std::vector & keys, std::vector & procs ) const { + confirm_host_mesh_is_synchronized_from_device(); procs.clear(); int num = keys.size(); std::vector procs_tmp; @@ -1768,6 +1749,7 @@ void BulkData::shared_procs_intersection(const std::vector & keys, st void BulkData::shared_procs_intersection(const EntityVector& entities, std::vector & procs ) const { + confirm_host_mesh_is_synchronized_from_device(); procs.clear(); int num = entities.size(); for (int i = 0; i < num; ++i) { @@ -2070,6 +2052,7 @@ void BulkData::update_field_data_states(bool rotateNgpFieldViews) const_entity_iterator BulkData::begin_entities(EntityRank ent_rank) const { + confirm_host_mesh_is_synchronized_from_device(); return m_entityKeyMapping->begin_rank(ent_rank); } @@ -2080,6 +2063,7 @@ const_entity_iterator BulkData::end_entities(EntityRank ent_rank) const Entity BulkData::get_entity( EntityRank ent_rank , EntityId entity_id ) const { + confirm_host_mesh_is_synchronized_from_device(); if (!impl::is_good_rank_and_id(mesh_meta_data(), ent_rank, entity_id)) { return Entity(); } @@ -2088,6 +2072,7 @@ Entity BulkData::get_entity( EntityRank ent_rank , EntityId entity_id ) const Entity BulkData::get_entity( const EntityKey key ) const { + confirm_host_mesh_is_synchronized_from_device(); return m_entityKeyMapping->get_entity(key); } @@ -2137,6 +2122,8 @@ void BulkData::erase_and_clear_if_empty(Entity entity, RelationIterator rel_itr) BucketVector const& BulkData::get_buckets(EntityRank rank, Selector const& selector) const { + confirm_host_mesh_is_synchronized_from_device(); + if (rank == stk::topology::INVALID_RANK) { static BucketVector empty; return empty; @@ -2168,6 +2155,7 @@ BucketVector const& BulkData::get_buckets(EntityRank rank, Selector const& selec } void 
BulkData::get_entities(EntityRank rank, Selector const& selector, EntityVector& output_entities) const { + confirm_host_mesh_is_synchronized_from_device(); output_entities.clear(); const stk::mesh::BucketVector &bucket_ptrs = get_buckets(rank, selector); for(size_t ib=0, ib_end=bucket_ptrs.size(); ib& BulkData::all_sharing_procs(stk::mesh::EntityRank rank) const { + confirm_host_mesh_is_synchronized_from_device(); internal_update_all_sharing_procs(); return m_all_sharing_procs[rank]; } @@ -5185,9 +5174,9 @@ void BulkData::internal_insert_all_parts_induced_from_higher_rank_entities_to_ve int num_upward_rels = num_connectivity(e_to, to_rel_rank_i); Entity const* upward_rel_entities = begin(e_to, to_rel_rank_i); + const Bucket* prevBucketPtr = nullptr; for (int k = 0; k < num_upward_rels; ++k) { - const Bucket* prevBucketPtr = nullptr; if (entity != upward_rel_entities[k]) // Already did this entity { const Bucket* curBucketPtr = bucket_ptr(upward_rel_entities[k]); @@ -5825,5 +5814,13 @@ EntityRank BulkData::get_entity_rank_count() const return mesh_meta_data().entity_rank_count(); } +void BulkData::confirm_host_mesh_is_synchronized_from_device(const char * fileName, int lineNumber) const +{ + STK_ThrowRequireMsg((not get_ngp_mesh()) || (not get_ngp_mesh()->need_sync_to_host()), + std::string(fileName) + ":" + std::to_string(lineNumber) + + " Accessing host-side BulkData or Field data after a device-side mesh modification without " + "calling NgpMesh::sync_to_host()"); +} + } // namespace mesh } // namespace stk diff --git a/packages/stk/stk_mesh/stk_mesh/base/BulkData.hpp b/packages/stk/stk_mesh/stk_mesh/base/BulkData.hpp index 6942896ee20a..d5f491b8cff8 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/BulkData.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/BulkData.hpp @@ -39,7 +39,6 @@ //---------------------------------------------------------------------- #include // for size_t #include // for uint16_t -#include // for max #include #include // for Entity, etc 
#include // for EntityCommDatabase @@ -56,6 +55,7 @@ #include "stk_mesh/base/Bucket.hpp" // for Bucket #include "stk_mesh/base/EntityKey.hpp" // for EntityKey, hash_value #include "stk_mesh/base/FieldDataManager.hpp" +#include "stk_mesh/base/FieldSyncDebugging.hpp" #include "stk_topology/topology.hpp" // for topology, etc #include "stk_util/util/ReportHandler.hpp" // for ThrowAssert, etc #include "stk_mesh/base/ModificationSummary.hpp" @@ -232,6 +232,7 @@ class BulkData { bool modification_begin(const std::string description = std::string("UNSPECIFIED")) { ProfilingBlock block("mod begin:" + description); + confirm_host_mesh_is_synchronized_from_device(); if(m_meshModification.in_modifiable_state()) { return false; } @@ -324,7 +325,10 @@ class BulkData { /** \brief Query all buckets of a given entity rank * Don't call inside BucketRepository member functions! */ - const BucketVector & buckets( EntityRank rank ) const { return m_bucket_repository.buckets(rank); } + const BucketVector & buckets( EntityRank rank ) const { + confirm_host_mesh_is_synchronized_from_device(); + return m_bucket_repository.buckets(rank); + } //iterator that traverses entities of the specified rank, in order of ascending global identifier const_entity_iterator begin_entities(EntityRank ent_rank) const; @@ -591,8 +595,14 @@ class BulkData { * Is likely to be stale if ownership or sharing has changed * and the 'modification_end' has not been called. */ - Ghosting & aura_ghosting() const { return *m_ghosting[AURA] ; } - Ghosting & shared_ghosting() const { return *m_ghosting[SHARED] ; } + Ghosting & aura_ghosting() const { + confirm_host_mesh_is_synchronized_from_device(); + return *m_ghosting[AURA]; + } + Ghosting & shared_ghosting() const { + confirm_host_mesh_is_synchronized_from_device(); + return *m_ghosting[SHARED]; + } /** Return the part corresponding to the specified ghosting. 
*/ @@ -637,7 +647,10 @@ class BulkData { void destroy_all_ghosting(); // Mod Mark /** \brief Vector of all ghostings */ - const std::vector & ghostings() const { return m_ghosting ; } + const std::vector & ghostings() const { + confirm_host_mesh_is_synchronized_from_device(); + return m_ghosting; + } size_t get_num_communicated_entities() const { return m_entity_comm_list.size(); } @@ -889,6 +902,9 @@ class BulkData { bool is_mesh_consistency_check_on() const { return m_runConsistencyCheck; } + void confirm_host_mesh_is_synchronized_from_device(const char * fileName = HOST_DEBUG_FILE_NAME, + int lineNumber = HOST_DEBUG_LINE_NUMBER) const; + protected: //functions BulkData(std::shared_ptr mesh_meta_data, ParallelMachine parallel, @@ -2080,6 +2096,7 @@ template inline NgpCommMapIndicesHostMirrorT BulkData::volatile_fast_shared_comm_map(EntityRank rank, int proc) const { + confirm_host_mesh_is_synchronized_from_device(); STK_ThrowAssert(this->in_synchronized_state()); STK_ThrowAssertMsg(rank < stk::topology::ELEMENT_RANK, "Cannot share entities of rank: " << rank); if (m_ngpMeshHostDataBase == nullptr || diff --git a/packages/stk/stk_mesh/stk_mesh/base/DeviceField.hpp b/packages/stk/stk_mesh/stk_mesh/base/DeviceField.hpp index 7793036e1248..0756cb18a676 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/DeviceField.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/DeviceField.hpp @@ -247,23 +247,21 @@ class DeviceField : public NgpFieldBase KOKKOS_FUNCTION unsigned get_component_stride() const { - unsigned stride = 1; #ifdef STK_USE_DEVICE_MESH - stride = bucketCapacity; + return bucketCapacity; +#else + return 1; #endif - return stride; } KOKKOS_FUNCTION unsigned get_num_components_per_entity(const FastMeshIndex& entityIndex) const { - const unsigned bucketId = entityIndex.bucket_id; - return deviceAllFieldsBucketsLayoutPerEntity(bucketId, NUM_COMPONENTS_INDEX); + return deviceAllFieldsBucketsLayoutPerEntity(entityIndex.bucket_id, NUM_COMPONENTS_INDEX); } KOKKOS_FUNCTION 
unsigned get_extent0_per_entity(const FastMeshIndex& entityIndex) const { - const unsigned bucketId = entityIndex.bucket_id; - return deviceAllFieldsBucketsLayoutPerEntity(bucketId, FIRST_DIMENSION_INDEX); + return deviceAllFieldsBucketsLayoutPerEntity(entityIndex.bucket_id, FIRST_DIMENSION_INDEX); } KOKKOS_FUNCTION @@ -325,8 +323,7 @@ class DeviceField : public NgpFieldBase { fieldSyncDebugger.device_stale_access_check(this, index, fileName, lineNumber); T* dataPtr = &deviceData(deviceSelectedBucketOffset(index.bucket_id), ORDER_INDICES(index.bucket_ord, 0)); - const unsigned numScalars = get_num_components_per_entity(index); - return EntityFieldData(dataPtr, numScalars, get_component_stride()); + return EntityFieldData(dataPtr, get_num_components_per_entity(index), get_component_stride()); } template diff --git a/packages/stk/stk_mesh/stk_mesh/base/DeviceMesh.hpp b/packages/stk/stk_mesh/stk_mesh/base/DeviceMesh.hpp index 492e189e2545..27faedc82158 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/DeviceMesh.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/DeviceMesh.hpp @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -182,16 +181,17 @@ class DeviceMeshT : public NgpMeshBase bulk(nullptr), spatial_dimension(0), synchronizedCount(0), + m_needSyncToHost(false), deviceMeshHostData(nullptr) {} explicit DeviceMeshT(const stk::mesh::BulkData& b) : NgpMeshBase(), - bulk(&b), + bulk(&const_cast(b)), spatial_dimension(b.mesh_meta_data().spatial_dimension()), synchronizedCount(0), + m_needSyncToHost(false), endRank(static_cast(bulk->mesh_meta_data().entity_rank_count())), - copyCounter("copy_counter"), deviceMeshHostData(nullptr) { bulk->register_device_mesh(); @@ -206,6 +206,7 @@ class DeviceMeshT : public NgpMeshBase KOKKOS_FUNCTION virtual ~DeviceMeshT() override { + m_needSyncToHost = false; clear_buckets_and_views(); } @@ -422,6 +423,12 @@ class DeviceMeshT : public NgpMeshBase buckets[rank] = BucketView(); } + stk::mesh::BulkData 
&get_bulk_on_host() + { + STK_ThrowRequireMsg(bulk != nullptr, "DeviceMesh::get_bulk_on_host, bulk==nullptr"); + return *bulk; + } + const stk::mesh::BulkData &get_bulk_on_host() const { STK_ThrowRequireMsg(bulk != nullptr, "DeviceMeshT::get_bulk_on_host, bulk==nullptr"); @@ -433,6 +440,88 @@ class DeviceMeshT : public NgpMeshBase return synchronizedCount == bulk->synchronized_count(); } + // This is an initial crude implementation that brings the device-side Views back to + // the host and then kicks off a host-side mesh modification. The modified host mesh + // is then synchronized back to device. This will not perform well and the semantics + // are a little different from the final device-side capability (because the host mesh + // will not be left in an unsynchronized state), but it can serve as a stand-in for + // the final device-side mesh modification capability in the meantime. + // + template + void batch_change_entity_parts(const Kokkos::View& entities, + const Kokkos::View& addPartOrdinals, + const Kokkos::View& removePartOrdinals) + { + using EntitiesMemorySpace = typename std::remove_reference::type::memory_space; + using AddPartOrdinalsMemorySpace = typename std::remove_reference::type::memory_space; + using RemovePartOrdinalsMemorySpace = typename std::remove_reference::type::memory_space; + + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'entities' View is inaccessible from the DeviceMesh execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'addPartOrdinals' View is inaccessible from the DeviceMesh execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'removePartOrdinals' View is inaccessible from the DeviceMesh execution space"); + + using HostEntitiesType = typename std::remove_reference::type::HostMirror; + using HostAddPartOrdinalsType = typename std::remove_reference::type::HostMirror; + using 
HostRemovePartOrdinalsType = typename std::remove_reference::type::HostMirror; + + HostEntitiesType copiedEntities = Kokkos::create_mirror_view(entities); + HostAddPartOrdinalsType copiedAddPartOrdinals = Kokkos::create_mirror_view(addPartOrdinals); + HostRemovePartOrdinalsType copiedRemovePartOrdinals = Kokkos::create_mirror_view(removePartOrdinals); + + Kokkos::deep_copy(copiedEntities, entities); + Kokkos::deep_copy(copiedAddPartOrdinals, addPartOrdinals); + Kokkos::deep_copy(copiedRemovePartOrdinals, removePartOrdinals); + + std::vector hostEntities; + std::vector hostAddParts; + std::vector hostRemoveParts; + + hostEntities.reserve(copiedEntities.extent(0)); + for (size_t i = 0; i < copiedEntities.extent(0); ++i) { + hostEntities.push_back(copiedEntities[i]); + } + + const stk::mesh::PartVector& parts = bulk->mesh_meta_data().get_parts(); + + hostAddParts.reserve(copiedAddPartOrdinals.extent(0)); + for (size_t i = 0; i < copiedAddPartOrdinals.extent(0); ++i) { + const size_t partOrdinal = copiedAddPartOrdinals[i]; + STK_ThrowRequire(partOrdinal < parts.size()); + hostAddParts.push_back(parts[partOrdinal]); + } + + hostRemoveParts.reserve(copiedRemovePartOrdinals.extent(0)); + for (size_t i = 0; i < copiedRemovePartOrdinals.extent(0); ++i) { + const size_t partOrdinal = copiedRemovePartOrdinals[i]; + STK_ThrowRequire(partOrdinal < parts.size()); + hostRemoveParts.push_back(parts[partOrdinal]); + } + + m_needSyncToHost = false; + bulk->batch_change_entity_parts(hostEntities, hostAddParts, hostRemoveParts); + + update_mesh(); + m_needSyncToHost = true; + } + + // This function should be called before doing any host-side mesh operations after a + // device-side mesh modification, to avoid accessing stale data. Accessing the host + // mesh without syncing it first should result in a throw. + // + void sync_to_host() { + m_needSyncToHost = false; + } + + // This can be used to check if the device-side mesh has been modified without + // synchronizing it to the host. 
+ // + bool need_sync_to_host() const override { + return m_needSyncToHost; + } + private: void set_entity_keys(const stk::mesh::BulkData& bulk_in); @@ -440,12 +529,6 @@ class DeviceMeshT : public NgpMeshBase void fill_sparse_connectivities(const stk::mesh::BulkData& bulk_in); - KOKKOS_FUNCTION - bool is_last_mesh_copy() const - { - return (copyCounter.use_count() == 1); - } - KOKKOS_FUNCTION bool is_last_bucket_reference(unsigned rank = stk::topology::NODE_RANK) const { @@ -456,10 +539,6 @@ class DeviceMeshT : public NgpMeshBase void clear_buckets_and_views() { KOKKOS_IF_ON_HOST(( - if (is_last_mesh_copy()) { - bulk->unregister_device_mesh(); - } - if (is_last_bucket_reference()) { for (stk::mesh::EntityRank rank=stk::topology::NODE_RANK; rank*, stk::ngp::UVMMemSpace>; - const stk::mesh::BulkData *bulk; + stk::mesh::BulkData* bulk; unsigned spatial_dimension; unsigned synchronizedCount; + bool m_needSyncToHost; stk::mesh::EntityRank endRank; - Kokkos::View copyCounter; impl::NgpMeshHostData* deviceMeshHostData; EntityKeyViewType entityKeys; diff --git a/packages/stk/stk_mesh/stk_mesh/base/FEMHelpers.cpp b/packages/stk/stk_mesh/stk_mesh/base/FEMHelpers.cpp index ceb6e6bdb27a..ca5a1b61ba11 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/FEMHelpers.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/FEMHelpers.cpp @@ -357,8 +357,7 @@ inline void sub_topology_check(const stk::mesh::EntityVector& candidateSideNodes << ", expected: " << subTopology.num_nodes()); } -inline void sub_topology_check(const stk::mesh::Entity* candidateSideNodes, - size_t numCandidateSideNodes, +inline void sub_topology_check(size_t numCandidateSideNodes, stk::topology elemTopology, stk::topology subTopology) { @@ -445,7 +444,7 @@ EquivAndPositive is_side_equivalent_and_positive(const stk::mesh::BulkData& mesh } stk::topology subTopology = elemTopology.sub_topology(mesh.mesh_meta_data().side_rank(), sideOrdinal); - sub_topology_check(candidateSideNodes, numCandidateSideNodes, elemTopology, 
subTopology); + sub_topology_check(numCandidateSideNodes, elemTopology, subTopology); return is_equivalent_and_positive(mesh, element, sideOrdinal, mesh.mesh_meta_data().side_rank(), candidateSideNodes); } diff --git a/packages/stk/stk_mesh/stk_mesh/base/FieldDataManager.cpp b/packages/stk/stk_mesh/stk_mesh/base/FieldDataManager.cpp index a2a606c51d97..05e11510bdea 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/FieldDataManager.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/FieldDataManager.cpp @@ -947,9 +947,16 @@ void ContiguousFieldDataManager::add_field_data_for_entity(const std::vector m_num_bytes_allocated_per_field[field_ordinal]; + const size_t newFieldSizeNeeded = m_num_bytes_used_per_field[field_ordinal] + extraAllocationNeeded; + + bool requiresNewAllocation = false; + size_t newFieldSize = m_num_bytes_allocated_per_field[field_ordinal]; + + // Only reallocate if we've outgrown the extra capacity + if (newFieldSizeNeeded > m_num_bytes_allocated_per_field[field_ordinal]) { + requiresNewAllocation = true; + newFieldSize = newFieldSizeNeeded + m_extra_capacity; + } unsigned char* new_field_data = m_field_raw_data[field_ordinal]; FieldMetaDataVector& field_meta_data_vector = const_cast(field.get_meta_data_for_field()); diff --git a/packages/stk/stk_mesh/stk_mesh/base/FieldParallel.cpp b/packages/stk/stk_mesh/stk_mesh/base/FieldParallel.cpp index b9254509fe57..cf3e7d2faefd 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/FieldParallel.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/FieldParallel.cpp @@ -101,6 +101,7 @@ void communicate_field_data(const Ghosting& ghosts, const std::vector & fields) { + mesh.confirm_host_mesh_is_synchronized_from_device(); const int parallel_size = mesh.parallel_size(); if ( fields.empty() || parallel_size == 1) { return; @@ -406,6 +408,7 @@ void parallel_op_impl(const BulkData& mesh, std::vector fields if (fields.empty()) { return; } + mesh.confirm_host_mesh_is_synchronized_from_device(); std::vector comm_procs = 
mesh.all_sharing_procs(fields[0]->entity_rank()); stk::mesh::EntityRank first_field_rank = fields[0]->entity_rank(); @@ -728,6 +731,7 @@ template void parallel_op_including_ghosts_impl(const BulkData & mesh, const std::vector & fields) { if ( fields.empty() ) { return; } + mesh.confirm_host_mesh_is_synchronized_from_device(); const int parallel_size = mesh.parallel_size(); diff --git a/packages/stk/stk_mesh/stk_mesh/base/HostMesh.hpp b/packages/stk/stk_mesh/stk_mesh/base/HostMesh.hpp index 93109961ed67..36823b3ede6f 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/HostMesh.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/HostMesh.hpp @@ -46,8 +46,6 @@ #include #include #include -#include -#include #include #include @@ -81,7 +79,7 @@ class HostMeshT : public NgpMeshBase HostMeshT(const stk::mesh::BulkData& b) : NgpMeshBase(), - bulk(&b), + bulk(&const_cast(b)), m_syncCountWhenUpdated(bulk->synchronized_count()) { require_ngp_mesh_rank_limit(bulk->mesh_meta_data()); @@ -128,7 +126,7 @@ class HostMeshT : public NgpMeshBase ConnectedEntities get_connected_entities(stk::mesh::EntityRank rank, const stk::mesh::FastMeshIndex &entity, stk::mesh::EntityRank connectedRank) const { const stk::mesh::Bucket& bucket = get_bucket(rank, entity.bucket_id); - return ConnectedEntities(bucket.begin(entity.bucket_ord, connectedRank), bucket.num_connectivity(entity.bucket_ord, connectedRank)); + return bucket.get_connected_entities(entity.bucket_ord, connectedRank); } ConnectedOrdinals get_connected_ordinals(stk::mesh::EntityRank rank, const stk::mesh::FastMeshIndex &entity, stk::mesh::EntityRank connectedRank) const @@ -239,6 +237,11 @@ class HostMeshT : public NgpMeshBase return bulk->volatile_fast_shared_comm_map(rank, proc); } + stk::mesh::BulkData &get_bulk_on_host() + { + return *bulk; + } + const stk::mesh::BulkData &get_bulk_on_host() const { return *bulk; @@ -249,8 +252,58 @@ class HostMeshT : public NgpMeshBase return m_syncCountWhenUpdated == bulk->synchronized_count(); } + 
template + void batch_change_entity_parts(const Kokkos::View& entities, + const Kokkos::View& addPartOrdinals, + const Kokkos::View& removePartOrdinals) + { + using EntitiesMemorySpace = typename std::remove_reference::type::memory_space; + using AddPartOrdinalsMemorySpace = typename std::remove_reference::type::memory_space; + using RemovePartOrdinalsMemorySpace = typename std::remove_reference::type::memory_space; + + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'entities' View is inaccessible from the HostMesh execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'addPartOrdinals' View is inaccessible from the HostMesh execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "The memory space of the 'removePartOrdinals' View is inaccessible from the HostMesh execution space"); + + std::vector hostEntities; + std::vector hostAddParts; + std::vector hostRemoveParts; + + hostEntities.reserve(entities.extent(0)); + for (size_t i = 0; i < entities.extent(0); ++i) { + hostEntities.push_back(entities[i]); + } + + const stk::mesh::PartVector& parts = bulk->mesh_meta_data().get_parts(); + + hostAddParts.reserve(addPartOrdinals.extent(0)); + for (size_t i = 0; i < addPartOrdinals.extent(0); ++i) { + const size_t partOrdinal = addPartOrdinals[i]; + STK_ThrowRequire(partOrdinal < parts.size()); + hostAddParts.push_back(parts[partOrdinal]); + } + + hostRemoveParts.reserve(removePartOrdinals.extent(0)); + for (size_t i = 0; i < removePartOrdinals.extent(0); ++i) { + const size_t partOrdinal = removePartOrdinals[i]; + STK_ThrowRequire(partOrdinal < parts.size()); + hostRemoveParts.push_back(parts[partOrdinal]); + } + + bulk->batch_change_entity_parts(hostEntities, hostAddParts, hostRemoveParts); + } + + void sync_to_host() {} + + bool need_sync_to_host() const override { + return false; + } + private: - const stk::mesh::BulkData *bulk; + stk::mesh::BulkData *bulk; size_t 
m_syncCountWhenUpdated; }; diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldParallel.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldParallel.hpp index a0b4128d299d..22ee83a98373 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldParallel.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldParallel.hpp @@ -187,7 +187,7 @@ class ParallelSumDataExchangeSymPackUnpackHandler for (stk::mesh::NgpField* field : m_ngpFields) { stk::mesh::FieldBase* stkField = m_ngpMesh.get_bulk_on_host().mesh_meta_data().get_fields()[field->get_ordinal()]; - stk::mesh::HostCommMapIndices commMapIndices = m_ngpMesh.get_bulk_on_host().volatile_fast_shared_comm_map(field->get_rank(), proc); + stk::mesh::HostCommMapIndices commMapIndices = m_ngpMesh.get_bulk_on_host().volatile_fast_shared_comm_map(field->get_rank(), proc); for (size_t i = 0; i < commMapIndices.extent(0); ++i) { const unsigned bucketId = commMapIndices(i).bucket_id; const unsigned numScalarsPerEntity = stk::mesh::field_scalars_per_entity(*stkField, bucketId); diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpMeshBase.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpMeshBase.hpp index 3ce2687afa10..b980a2cff561 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpMeshBase.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpMeshBase.hpp @@ -19,6 +19,7 @@ class NgpMeshBase KOKKOS_DEFAULTED_FUNCTION NgpMeshBase& operator=(NgpMeshBase&&) = default; virtual void update_mesh() = 0; + virtual bool need_sync_to_host() const = 0; }; }} diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpParallelComm.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpParallelComm.hpp index c2674c75f6a9..d39df0626a63 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpParallelComm.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpParallelComm.hpp @@ -34,9 +34,10 @@ #ifndef STK_MESH_NGPPARALLELCOMM_HPP #define STK_MESH_NGPPARALLELCOMM_HPP -#include "stk_util/parallel/Parallel.hpp" // for ParallelMachine -#include "stk_util/ngp/NgpSpaces.hpp" -#include 
"Kokkos_Core.hpp" +#include +#include +#include +#include namespace stk { namespace mesh { @@ -58,7 +59,6 @@ void ngp_parallel_data_exchange_sym_pack_unpack(MPI_Comm mpi_communicator, const int pRank = stk::parallel_machine_rank(mpi_communicator); const int msgTag = 10242; size_t num_comm_procs = comm_procs.size(); - int dataTypeSize = sizeof(T); CommProcsViewType deviceCommProcs("DeviceCommProcs", num_comm_procs); CommProcsViewType::HostMirror hostCommProcs = Kokkos::create_mirror_view(deviceCommProcs); @@ -98,14 +98,15 @@ void ngp_parallel_data_exchange_sym_pack_unpack(MPI_Comm mpi_communicator, BufferViewType buffer = Kokkos::subview( deviceSendData, Kokkos::pair(dataBegin, dataEnd)); exchangeHandler.devicePackMessage(pRank, deviceCommProcs(iproc), buffer); }); + Kokkos::fence(); for (size_t proc = 0; proc < num_comm_procs; ++proc) { int iproc = comm_procs[proc]; const size_t dataBegin = hostBufferOffsets[proc]; const size_t dataEnd = hostBufferOffsets[proc+1]; - int bufSize = (dataEnd-dataBegin) * dataTypeSize; - MPI_Irecv((deviceRecvData.data()+dataBegin), bufSize, MPI_CHAR, iproc, msgTag, mpi_communicator, &recvRequests[proc]); - MPI_Isend((deviceSendData.data()+dataBegin), bufSize, MPI_CHAR, iproc, msgTag, mpi_communicator, &sendRequests[proc]); + int bufSize = (dataEnd-dataBegin); + MPI_Irecv((deviceRecvData.data()+dataBegin), bufSize, sierra::MPI::Datatype::type(), iproc, msgTag, mpi_communicator, &recvRequests[proc]); + MPI_Isend((deviceSendData.data()+dataBegin), bufSize, sierra::MPI::Datatype::type(), iproc, msgTag, mpi_communicator, &sendRequests[proc]); } for (size_t proc = 0; proc < num_comm_procs; ++proc) { diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpTypes.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpTypes.hpp index 0a128c4f29e4..85099a167005 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpTypes.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpTypes.hpp @@ -68,8 +68,8 @@ using HostPartOrdinalViewType = Kokkos::View; template using 
PermutationViewTypeT = Kokkos::View; using FastSharedCommMapViewType = DeviceCommMapIndices; -using HostMeshIndexType = Kokkos::View::HostMirror; using MeshIndexType = Kokkos::View>; +using HostMeshIndexType = MeshIndexType::HostMirror; using BucketEntityOffsetsViewType = Kokkos::View; diff --git a/packages/stk/stk_mesh/stk_mesh/base/Selector.hpp b/packages/stk/stk_mesh/stk_mesh/base/Selector.hpp index 0d0c686000f3..f831c7f7a925 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/Selector.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/Selector.hpp @@ -322,12 +322,6 @@ class Selector { friend std::ostream & operator << ( std::ostream & out, const Selector & selector); -private: - bool select_part_impl(Part const& part, impl::SelectorNode const* root) const; - bool select_bucket_impl(Bucket const& bucket, impl::SelectorNode const* root) const; - - const BulkData* find_mesh() const; - bool is_null() const { if(m_expr.size() > 1) return false; if(m_expr.back().m_type == SelectorNodeType::PART && m_expr.back().part() == InvalidPartOrdinal) { @@ -338,6 +332,12 @@ class Selector { return false; } +private: + bool select_part_impl(Part const& part, impl::SelectorNode const* root) const; + bool select_bucket_impl(Bucket const& bucket, impl::SelectorNode const* root) const; + + const BulkData* find_mesh() const; + Selector& add_binary_op(SelectorNodeType::node_type type, const Selector& rhs); std::vector m_expr; diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.cpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.cpp index e5a12eb5ca37..9f58a66dd2fc 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.cpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.cpp @@ -198,7 +198,7 @@ void BucketRepository::fill_key_ptr(const OrdinalVector& parts, PartOrdinal** ke { const size_t part_count = parts.size(); - const size_t keyLen = 2 + part_count; + const size_t keyLen = 1 + part_count; *keyPtr = keyTmpBuffer; *keyEnd = 
*keyPtr+keyLen; @@ -211,16 +211,12 @@ void BucketRepository::fill_key_ptr(const OrdinalVector& parts, PartOrdinal** ke //---------------------------------- // Key layout: - // { part_count + 1 , { part_ordinals } , partition_count } - // Thus partition_count = key[ key[0] ] + // { part_count , { part_ordinals } } // - // for upper bound search use the maximum key for a bucket in the partition. - const unsigned max = static_cast(-1); - (*keyPtr)[0] = part_count+1; - (*keyPtr)[ (*keyPtr)[0] ] = max ; + (*keyPtr)[0] = part_count; - { - for ( unsigned i = 0 ; i < part_count ; ++i ) { (*keyPtr)[i+1] = parts[i] ; } + for ( unsigned i = 0 ; i < part_count ; ++i ) { + (*keyPtr)[i+1] = parts[i]; } } @@ -246,22 +242,19 @@ Partition *BucketRepository::get_partition( PartOrdinal* keyPtr, PartOrdinal* keyEnd) { - STK_ThrowRequireMsg(m_mesh.mesh_meta_data().check_rank(arg_entity_rank), "Entity rank " << arg_entity_rank - << " is invalid"); + STK_ThrowAssertMsg(m_mesh.mesh_meta_data().check_rank(arg_entity_rank), + "Entity rank " << arg_entity_rank << " is invalid"); ensure_data_structures_sized(); std::vector & partitions = m_partitions[ arg_entity_rank ]; - // If the partition is found, the iterator will be right after it, thanks to the - // trickiness above. 
ik = lower_bound( partitions , keyPtr ); - const bool partition_exists = - (ik != partitions.begin()) && raw_part_equal( ik[-1]->key() , keyPtr ); + const bool partition_exists = (ik != partitions.end()) && raw_part_equal( (*ik)->key() , keyPtr ); if (partition_exists) { - return ik[-1]; + return *ik; } return nullptr; @@ -274,8 +267,6 @@ Partition* BucketRepository::create_partition( PartOrdinal* keyPtr, PartOrdinal* keyEnd) { - keyPtr[keyPtr[0]] = 0; - Partition *partition = new Partition(m_mesh, this, arg_entity_rank, keyPtr, keyEnd); STK_ThrowRequire(partition != nullptr); @@ -420,7 +411,8 @@ Bucket *BucketRepository::allocate_bucket(EntityRank entityRank, unsigned initialCapacity, unsigned maximumCapacity) { - STK_ThrowAssertMsg(stk::util::is_sorted_and_unique(std::vector(key.begin()+1,key.end()-1),std::less()), + std::vector tmp(key.begin()+1,key.end()); + STK_ThrowAssertMsg(stk::util::is_sorted_and_unique(tmp,std::less()), "bucket created with 'key' vector that's not sorted and unique"); BucketVector &bucket_vec = m_buckets[entityRank]; const unsigned bucket_id = bucket_vec.size(); @@ -465,15 +457,9 @@ void BucketRepository::sync_bucket_ids(EntityRank entity_rank) m_mesh.reorder_buckets_callback(entity_rank, id_map); } -std::vector BucketRepository::get_partitions(EntityRank rank) const +const std::vector& BucketRepository::get_partitions(EntityRank rank) const { - std::vector retval; - std::vector const& bf_vec = m_partitions[rank]; - for (size_t i = 0; i < bf_vec.size(); ++i) - { - retval.push_back(bf_vec[i]); - } - return retval; + return m_partitions[rank]; } void BucketRepository::delete_bucket(Bucket * bucket) diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.hpp index 749814a90781..27366e248101 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/BucketRepository.hpp @@ -137,8 +137,7 @@ class 
BucketRepository void sync_from_partitions(); void sync_from_partitions(EntityRank rank); - // Used in unit tests. Returns the current partitions. - std::vector get_partitions(EntityRank rank) const; + const std::vector& get_partitions(EntityRank rank) const; Partition* get_partition(const EntityRank arg_entity_rank, const OrdinalVector &parts); diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/NgpFieldBLASImpl.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/NgpFieldBLASImpl.hpp index f3368b340236..84fa51cfdd90 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/NgpFieldBLASImpl.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/NgpFieldBLASImpl.hpp @@ -99,6 +99,16 @@ void mark_field_modified(const mesh::FieldBase& field, EXEC_SPACE execSpace, boo } } +template +void construct_device_fields(FieldViewType& ngpFields) +{ + using NgpFieldType = typename FieldViewType::value_type; + Kokkos::parallel_for(stk::ngp::DeviceRangePolicy(0, ngpFields.size()), + KOKKOS_LAMBDA(const unsigned& i) { + new (&ngpFields(i)) NgpFieldType(); + }); +} + template class FieldFill { public: @@ -113,8 +123,18 @@ class FieldFill { } } else { - Kokkos::resize(ngpFieldsDynamic, nfields); + constexpr bool accessible = Kokkos::SpaceAccessibility::accessible; + + if (accessible) { + Kokkos::resize(ngpFieldsDynamic, nfields); + } + else { + Kokkos::resize(Kokkos::WithoutInitializing, ngpFieldsDynamic, nfields); + construct_device_fields(ngpFieldsDynamic); + } + auto ngpFieldsDynamicHost = Kokkos::create_mirror_view(ngpFieldsDynamic); + for (int i=0; i < nfields; ++i) { ngpFieldsDynamicHost[i] = fields[i]; @@ -123,6 +143,8 @@ class FieldFill { } } + KOKKOS_FUNCTION ~FieldFill() { } + KOKKOS_FUNCTION void operator()(const stk::mesh::FastMeshIndex& entityIndex) const { @@ -152,6 +174,7 @@ class FieldFill { using FieldView = Kokkos::View; using FieldHostView = typename FieldView::HostMirror; + using MemorySpace = typename FieldView::traits::memory_space; static constexpr int STATIC_FIELD_LIMIT = 4; 
NGP_FIELD_TYPE ngpFieldsStatic[STATIC_FIELD_LIMIT]; FieldView ngpFieldsDynamic; @@ -175,8 +198,18 @@ class FieldFillComponent { } } else { - Kokkos::resize(ngpFieldsDynamic, nfields); + constexpr bool accessible = Kokkos::SpaceAccessibility::accessible; + + if constexpr (accessible) { + Kokkos::resize(ngpFieldsDynamic, nfields); + } + else { + Kokkos::resize(Kokkos::WithoutInitializing, ngpFieldsDynamic, nfields); + construct_device_fields(ngpFieldsDynamic); + } + auto ngpFieldsDynamicHost = Kokkos::create_mirror_view(ngpFieldsDynamic); + for (int i=0; i < nfields; ++i) { ngpFieldsDynamicHost(i) = fields[i]; @@ -217,6 +250,7 @@ class FieldFillComponent { using FieldView = Kokkos::View; using FieldHostView = typename FieldView::HostMirror; + using MemorySpace = typename FieldView::traits::memory_space; static constexpr int STATIC_FIELD_LIMIT = 4; NGP_FIELD_TYPE ngpFieldsStatic[STATIC_FIELD_LIMIT]; FieldView ngpFieldsDynamic; @@ -237,10 +271,12 @@ void field_fill_for_each_entity(const NGP_MESH_TYPE& ngpMesh, if (component == -1) { FieldFill fieldFill(ngpFields, nfields, alpha); stk::mesh::for_each_entity_run(ngpMesh, ngpFields[0].get_rank(), selector, fieldFill, execSpace); + Kokkos::fence(); } else { FieldFillComponent fieldFill(ngpFields, nfields, alpha, component); stk::mesh::for_each_entity_run(ngpMesh, ngpFields[0].get_rank(), selector, fieldFill, execSpace); + Kokkos::fence(); } } @@ -334,6 +370,8 @@ template Scalar field_amax_no_mark_t( const stk::mesh::FieldBase& xField, const stk::mesh::Selector& selector, const EXEC_SPACE& execSpace) { + Scalar amaxOut = 0; + if constexpr (operate_on_ngp_mesh()) { xField.sync_to_device(); stk::mesh::NgpField& ngpX = stk::mesh::get_updated_ngp_field(xField); @@ -342,16 +380,16 @@ Scalar field_amax_no_mark_t( stk::mesh::for_each_entity_run(ngpMesh, xField.entity_rank(), selector, fieldAMax); Scalar localAmax = fieldAMax.get_amax_val(); - Scalar globalAmax = localAmax; auto comm = xField.get_mesh().parallel(); - 
stk::all_reduce_max(comm, &localAmax, &globalAmax, 1u); - return globalAmax; + stk::all_reduce_max(comm, &localAmax, &amaxOut, 1u); } else { xField.sync_to_host(); - double amaxOut{0.0}; - stk::mesh::field_amax(amaxOut, xField, selector); - return amaxOut; + double tmpAmax = 0.0; + stk::mesh::field_amax(tmpAmax, xField, selector); + amaxOut = tmpAmax; } + + return amaxOut; } template diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.cpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.cpp index 59d4e7a10076..cc95d6223656 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.cpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.cpp @@ -327,8 +327,6 @@ stk::mesh::FieldVector get_fields_for_bucket(const stk::mesh::BulkData& mesh, void Partition::sort(const EntitySorterBase& sorter) { std::vector partition_key = get_legacy_partition_id(); - //index of bucket in partition - partition_key[ partition_key[0] ] = 0; std::vector entities(m_size); @@ -543,7 +541,6 @@ stk::mesh::Bucket *Partition::get_bucket_for_adds() if (no_buckets()) { std::vector partition_key = get_legacy_partition_id(); - partition_key[ partition_key[0] ] = 0; Bucket *bucket = m_repository->allocate_bucket(m_rank, partition_key, m_repository->get_initial_bucket_capacity(), m_repository->get_maximum_bucket_capacity()); @@ -558,7 +555,6 @@ stk::mesh::Bucket *Partition::get_bucket_for_adds() if (bucket->size() == bucket->capacity()) { if (bucket->size() == m_repository->get_maximum_bucket_capacity()) { std::vector partition_key = get_legacy_partition_id(); - partition_key[ partition_key[0] ] = m_buckets.size(); bucket = m_repository->allocate_bucket(m_rank, partition_key, m_repository->get_initial_bucket_capacity(), m_repository->get_maximum_bucket_capacity()); diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.hpp index 1708c6093b85..a10a754fb2a2 100644 --- 
a/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/Partition.hpp @@ -209,34 +209,30 @@ class Partition std::ostream &operator<<(std::ostream &, const stk::mesh::impl::Partition &); -struct PartitionLess { - bool operator()( const Partition * lhs_Partition , const unsigned * rhs ) const ; - bool operator()( const unsigned * lhs , const Partition * rhs_Partition ) const ; -}; - inline bool partition_key_less( const unsigned * lhs , const unsigned * rhs ) { -// const unsigned * const last_lhs = lhs + ( *lhs < *rhs ? *lhs : *rhs ); -// while ( last_lhs != lhs && *lhs == *rhs ) { ++lhs ; ++rhs ; } +// The following (very old) code is clever... So I'm adding some comments. +// +// A partition key is an array of unsigned, laid out like this: +// key[num-part-ordinals, first-part-ordinal, ..., last-part-ordinal] - if (*lhs == *rhs) { + if (*lhs == *rhs) { //num-part-ordinals is equal for lhs and rhs... const unsigned * const last_lhs = lhs + *lhs; do { ++lhs ; ++rhs ; } while ( last_lhs != lhs && *lhs == *rhs ); } - return *lhs < *rhs ; + return *lhs < *rhs; } -// The part count and part ordinals are less -inline bool PartitionLess::operator()( const Partition * lhs_partition , - const unsigned * rhs ) const -{ return partition_key_less( lhs_partition->key() , rhs ); } +struct PartitionLess { + bool operator()( const Partition * lhs_Partition , const unsigned * rhs ) const + { return partition_key_less( lhs_Partition->key() , rhs ); } -inline bool PartitionLess::operator()( const unsigned * lhs , - const Partition * rhs_partition ) const -{ return partition_key_less( lhs , rhs_partition->key() ); } + bool operator()( const unsigned * lhs , const Partition * rhs_Partition ) const + { return partition_key_less( lhs , rhs_Partition->key() ); } +}; inline std::vector::iterator @@ -247,4 +243,5 @@ lower_bound( std::vector & v , const unsigned * key ) } // mesh } // stk -#endif /* PartitionFAMILY_HPP_ */ +#endif /* 
STK_MESH_IMPL_PARTITION_HPP_ */ + diff --git a/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAccess.cpp b/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAccess.cpp index afb163cd5f13..0a8b1d1837c1 100644 --- a/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAccess.cpp +++ b/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAccess.cpp @@ -124,7 +124,7 @@ class NgpFieldAccess : public stk::unit_test_util::MeshFixture TEST_F(NgpFieldAccess, Centroid) { - if (get_parallel_size() != 1) return; + if (get_parallel_size() != 1) { GTEST_SKIP(); } const unsigned NUM_RUNS = 5; const int NUM_ITERS = 100; @@ -149,7 +149,7 @@ TEST_F(NgpFieldAccess, Centroid) TEST_F(NgpFieldAccess, HostCentroid) { - if (get_parallel_size() != 1) return; + if (get_parallel_size() != 1) { GTEST_SKIP(); } const unsigned NUM_RUNS = 5; const int NUM_ITERS = 100; diff --git a/packages/stk/stk_performance_tests/stk_mesh/ParallelSum.cpp b/packages/stk/stk_performance_tests/stk_mesh/ParallelSum.cpp index 3539d763c041..43f897e8cbd1 100644 --- a/packages/stk/stk_performance_tests/stk_mesh/ParallelSum.cpp +++ b/packages/stk/stk_performance_tests/stk_mesh/ParallelSum.cpp @@ -11,11 +11,15 @@ #include #include // for parallel_machine_rank #include // for parallel_data_exchange... +#include "stk_mesh/base/Types.hpp" // for BucketVector, EntityPr... #include // for Bucket #include // for BulkData, BulkData::NO... #include #include // for field_data, FieldBase #include // for parallel_sum, parallel... +#include +#include +#include #include // for count_selected_entities #include // for Part #include // for Selector, operator| @@ -25,7 +29,6 @@ #include "stk_mesh/base/Entity.hpp" // for Entity #include "stk_mesh/base/Field.hpp" // for Field #include "stk_mesh/base/MetaData.hpp" // for MetaData, put_field_on... -#include "stk_mesh/base/Types.hpp" // for BucketVector, EntityPr... #include "stk_util/environment/Env.hpp" // for parallel_rank, paralle... 
#include "stk_util/environment/perf_util.hpp" // for get_max_hwm_across_procs #include // for size_t @@ -51,7 +54,7 @@ stk::mesh::EntityId node_id( unsigned x , unsigned y , unsigned z, unsigned nx, return 1 + x + ( nx + 1 ) * ( y + ( ny + 1 ) * z ); } -void do_stk_test(bool with_ghosts=false) +void do_stk_test(bool with_ghosts=false, bool device_mpi=false) { using namespace stk::mesh; @@ -68,14 +71,15 @@ void do_stk_test(bool with_ghosts=false) } return; } - int z_dim = parallel_size*2; - //vector of mesh-dimensions holds the number of elements in each dimension. - //Hard-wired to 3. This test can run with spatial-dimension less than 3, - //(if generated-mesh can do that) but not greater than 3. + const int xdim = device_mpi ? 21 : X_DIM; + const int ydim = device_mpi ? 21 : Y_DIM; + const int zdim = parallel_size*2; + const int numFields = device_mpi ? 5 : NUM_FIELDS; + const int numIters = device_mpi ? 5 : NUM_ITERS; std::ostringstream oss; - oss << "generated:" << X_DIM << "x" << Y_DIM << "x" << z_dim; + oss << "generated:" << xdim << "x" << ydim << "x" << zdim; stk::mesh::MeshBuilder builder(pm); unsigned spatialDim = 3; @@ -89,8 +93,8 @@ void do_stk_test(bool with_ghosts=false) std::cerr << "Mesh: " << oss.str() << std::endl; } - std::vector fields(NUM_FIELDS); - for (int i = 0; i < NUM_FIELDS; ++i) { + std::vector fields(numFields); + for (int i = 0; i < numFields; ++i) { std::ostringstream oss2; oss2 << "field_" << i; FieldBase* field = &meta.declare_field(stk::topology::NODE_RANK, oss2.str()); @@ -162,7 +166,7 @@ void do_stk_test(bool with_ghosts=false) stk::mesh::BucketVector const& node_buckets = bulk.get_buckets(stk::topology::NODE_RANK, communicated_nodes); for (int b = 0, be = node_buckets.size(); b < be; ++b) { stk::mesh::Bucket const& bucket = *node_buckets[b]; - for (int i = 0; i < NUM_FIELDS; ++i) { + for (int i = 0; i < numFields; ++i) { const ScalarField& field = dynamic_cast(*fields[i]); double* data = stk::mesh::field_data(field, bucket); for 
(int n = 0, ne = bucket.size(); n < ne; ++n) { @@ -174,15 +178,30 @@ void do_stk_test(bool with_ghosts=false) MPI_Barrier(pm); double start_time = stk::cpu_time(); + NgpMesh* ngpMesh = nullptr; + if (device_mpi) { + ngpMesh = & stk::mesh::get_updated_ngp_mesh(bulk); + } - for (int t = 0; t < NUM_ITERS; ++t) { - if (with_ghosts) - { + std::vector*> ngpFields(numFields); + if (device_mpi) { + for (int i = 0; i < numFields; ++i) { + ngpFields[i] = &stk::mesh::get_updated_ngp_field(*fields[i]); + } + } + + for (int t = 0; t < numIters; ++t) { + if (with_ghosts) { + STK_ThrowRequireMsg(!device_mpi, "NGP parallel_sum_including_ghosts not implemented yet."); stk::mesh::parallel_sum_including_ghosts(bulk, fields); } - else - { - stk::mesh::parallel_sum(bulk, fields); + else { + if (device_mpi) { + stk::mesh::parallel_sum_device_mpi(*ngpMesh, ngpFields); + } + else { + stk::mesh::parallel_sum(bulk, fields); + } } } @@ -191,17 +210,23 @@ void do_stk_test(bool with_ghosts=false) double max_time; MPI_Reduce(static_cast(&stk_sum_time), static_cast(&max_time), 1, MPI_DOUBLE, MPI_MAX, 0 /*root*/, MPI_COMM_WORLD); - double power2 = std::pow(2,NUM_ITERS); - double power3 = std::pow(3,NUM_ITERS); + double power2 = std::pow(2,numIters); + double power3 = std::pow(3,numIters); const double tolerance = 1.e-8; + if (device_mpi) { + for (int i = 0; i < numFields; ++i) { + ngpFields[i]->sync_to_host(); + } + } + // Sanity check size_t num_comm_nodes = 0; for (int b = 0, be = node_buckets.size(); b < be; ++b) { stk::mesh::Bucket const& bucket = *node_buckets[b]; const bool isShared = bucket.shared(); num_comm_nodes += bucket.size(); - for (int f = 0; f < NUM_FIELDS; ++f) { + for (int f = 0; f < numFields; ++f) { const ScalarField& field = dynamic_cast(*fields[f]); const double* stk_data = stk::mesh::field_data(field, bucket); const double expected_shared_value = static_cast(f+1) * power2; @@ -209,15 +234,15 @@ void do_stk_test(bool with_ghosts=false) const double expected = isShared ? 
expected_shared_value : expected_ghosted_value; for (int n = 0, ne = bucket.size(); n < ne; ++n) { const double relativeError = std::abs(stk_data[n] - expected) / expected; - EXPECT_NEAR(0.0, relativeError, tolerance); + EXPECT_NEAR(0.0, relativeError, tolerance)<<"node "< +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +TEST(PrintTimersTable, performance) +{ + stk::ParallelMachine comm = MPI_COMM_WORLD; + + const unsigned NUM_BATCHES = 5; + const unsigned NUM_RUNS = 5; + + stk::unit_test_util::BatchTimer batchTimer(comm); + batchTimer.initialize_batch_timer(); + + for(unsigned b=0; b(stk::topology::NODE_RANK, "Coordinates") ), owns_mesh(false) { + if (!elemPartAlias.empty()) { + m_meta.add_part_alias(*m_elem_parts[0], elemPartAlias); + } //put coord-field on all nodes: put_field_on_mesh(*m_coord_field, m_meta.universal_part(), m_spatial_dimension, nullptr); } @@ -151,6 +155,24 @@ HexFixture::HexFixture(stk::ParallelMachine pm, put_field_on_mesh(*m_coord_field, m_meta.universal_part(), m_spatial_dimension, nullptr); } +void HexFixture::fill_mesh(size_t nx, + size_t ny, + size_t nz, + BulkData& bulk, + const std::string& elemPartAlias) +{ + size_t nidStart = 1; + size_t eidStart = 1; + if (!bulk.mesh_meta_data().is_initialized()) { + const size_t spatialDim = 3; + const std::string coordFieldName("Coordinates"); + bulk.mesh_meta_data().initialize(spatialDim, stk::mesh::entity_rank_names(), coordFieldName); + } + HexFixture hexFixture(bulk.mesh_meta_data(), bulk, nx, ny, nz, nidStart, eidStart, elemPartAlias); + hexFixture.m_meta.commit(); + hexFixture.generate_mesh(); +} + HexFixture::~HexFixture() { diff --git a/packages/stk/stk_unit_test_utils/stk_unit_test_utils/stk_mesh_fixtures/HexFixture.hpp b/packages/stk/stk_unit_test_utils/stk_unit_test_utils/stk_mesh_fixtures/HexFixture.hpp index ad4c6d701d9b..30c6c1a5891a 100644 --- a/packages/stk/stk_unit_test_utils/stk_unit_test_utils/stk_mesh_fixtures/HexFixture.hpp +++ 
b/packages/stk/stk_unit_test_utils/stk_unit_test_utils/stk_mesh_fixtures/HexFixture.hpp @@ -84,7 +84,8 @@ class HexFixture size_t ny, size_t nz, size_t nid_start, - size_t eid_start); + size_t eid_start, + const std::string& elemPartAlias = std::string("")); HexFixture(stk::ParallelMachine pm, size_t nx, @@ -103,6 +104,12 @@ class HexFixture size_t nz, bool auraOn); + static void fill_mesh(size_t nx, + size_t ny, + size_t nz, + BulkData& bulk, + const std::string& elemPartAlias = std::string("")); + const int m_spatial_dimension; const size_t m_nx; const size_t m_ny; diff --git a/packages/stk/stk_unit_tests/stk_balance/UnitTestStkBalanceDecomposition.cpp b/packages/stk/stk_unit_tests/stk_balance/UnitTestStkBalanceDecomposition.cpp index bd492095fbd8..86b06bf01e60 100644 --- a/packages/stk/stk_unit_tests/stk_balance/UnitTestStkBalanceDecomposition.cpp +++ b/packages/stk/stk_unit_tests/stk_balance/UnitTestStkBalanceDecomposition.cpp @@ -96,8 +96,7 @@ class StkBalanceDecomposition : public stk::unit_test_util::MeshFixture TEST_F(StkBalanceDecomposition, 4Elem1ProcMesh_EntireDomain) { - if (stk::parallel_machine_size(get_comm()) != 1) return; - + if (stk::parallel_machine_size(get_comm()) != 1) { GTEST_SKIP(); } setup_initial_mesh("generated:1x1x4"); balance_mesh({get_meta().universal_part()}); diff --git a/packages/stk/stk_unit_tests/stk_expreval/UnitTestEvaluator.cpp b/packages/stk/stk_unit_tests/stk_expreval/UnitTestEvaluator.cpp index 16b8daf7da74..f4c24d8b8174 100644 --- a/packages/stk/stk_unit_tests/stk_expreval/UnitTestEvaluator.cpp +++ b/packages/stk/stk_unit_tests/stk_expreval/UnitTestEvaluator.cpp @@ -36,17 +36,22 @@ #include #include #include +#include #include #include #include #include #include #include +#include "stk_expreval/NgpNode.hpp" +#include "stk_expreval/Node.hpp" namespace { using ViewInt1DHostType = Kokkos::View; +using FPErrorBehavior = stk::expreval::Eval::FPErrorBehavior; + bool has_variable(const std::vector& variableNames, const 
std::string& variableName) { @@ -87,6 +92,7 @@ double evaluate(const std::string & expression, const stk::expreval::Variable::ArrayOffset arrayOffsetType = stk::expreval::Variable::ZERO_BASED_INDEX) { stk::expreval::Eval eval(expression, arrayOffsetType); + eval.set_fp_error_behavior(stk::expreval::Eval::FPErrorBehavior::Error); eval.parse(); for (ScalarBinding & scalar : boundScalars) { @@ -565,6 +571,37 @@ TEST( UnitTestEvaluator, testEvaluateEmptyString) EXPECT_EQ(0.0, result); } +TEST( UnitTestEvaluator, FunctionNameNullTerminated) +{ + stk::expreval::Eval eval("sin(0.5)"); + eval.parse(); + for (int i=0; i < eval.get_node_count(); ++i) + { + stk::expreval::Node* node = eval.get_node(i); + if (node->m_opcode == stk::expreval::OPCODE_FUNCTION) + { + EXPECT_EQ(std::strcmp(node->m_data.function.functionName, "sin"), 0); + } + } +} + +#ifndef STK_ENABLE_GPU + +TEST(UnitTestEvaluator, CheckNGPNodeFPError_Ignore) +{ + FPErrorBehavior m_fpErrorBehavior = FPErrorBehavior::Ignore; + EXPECT_NO_THROW(checkNgpNodeFPError(NAN, "foo")); +} + +TEST(UnitTestEvaluator, CheckNGPNodeFPError_Error) +{ + FPErrorBehavior m_fpErrorBehavior = FPErrorBehavior::Error; + EXPECT_ANY_THROW(checkNgpNodeFPError(NAN, "foo")); +} + +#endif + + TEST(UnitTestEvaluator, test_copy_constructor) { double a = 1.0; @@ -2224,6 +2261,9 @@ TEST(UnitTestEvaluator, testFunction_sqrt) EXPECT_DOUBLE_EQ(evaluate("sqrt(9)"), 3); EXPECT_DOUBLE_EQ(evaluate("sqrt(2)"), std::sqrt(2)); EXPECT_DOUBLE_EQ(evaluate("sqrt(1.21)"), 1.1); + if (stk::util::have_errno() || stk::util::have_errexcept()) { + EXPECT_ANY_THROW(evaluate("sqrt(-1)")); + } } TEST(UnitTestEvaluator, Ngp_testFunction_sqrt) @@ -2234,6 +2274,36 @@ TEST(UnitTestEvaluator, Ngp_testFunction_sqrt) EXPECT_DOUBLE_EQ(device_evaluate("sqrt(9)"), 3); EXPECT_DOUBLE_EQ(device_evaluate("sqrt(2)"), std::sqrt(2)); EXPECT_DOUBLE_EQ(device_evaluate("sqrt(1.21)"), 1.1); + if (stk::util::have_errno() || stk::util::have_errexcept()) { + KOKKOS_IF_ON_HOST( + 
EXPECT_ANY_THROW(evaluate("sqrt(-1)")); + ) + } +} + +TEST(UnitTestEvaluator, IgnoreFloatingPointError) +{ + stk::expreval::Eval eval("sqrt(-1)"); + eval.set_fp_error_behavior(stk::expreval::Eval::FPErrorBehavior::Ignore); + eval.parse(); + EXPECT_NO_THROW(eval.evaluate()); +} + +TEST(UnitTestEvaluator, WarnFloatingPointError) +{ + stk::expreval::Eval eval("sqrt(-1)"); + eval.set_fp_error_behavior(stk::expreval::Eval::FPErrorBehavior::Warn); + eval.parse(); + EXPECT_NO_THROW(eval.evaluate()); +} + +TEST(UnitTestEvaluator, ThrowFloatingPointError) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) { GTEST_SKIP(); } + stk::expreval::Eval eval("sqrt(-1)"); + eval.set_fp_error_behavior(stk::expreval::Eval::FPErrorBehavior::Error); + eval.parse(); + EXPECT_ANY_THROW(eval.evaluate()); } TEST(UnitTestEvaluator, testFunction_exp) @@ -2598,7 +2668,7 @@ TEST(UnitTestEvaluator, testFunction_atanh) EXPECT_DOUBLE_EQ(evaluate("atanh(0)"), 0); EXPECT_DOUBLE_EQ(evaluate("atanh(0.1)"), std::atanh(0.1)); EXPECT_DOUBLE_EQ(evaluate("atanh(0.5)"), std::atanh(0.5)); - EXPECT_DOUBLE_EQ(evaluate("atanh(1)"), std::atanh(1)); + EXPECT_DOUBLE_EQ(evaluate("atanh(0.9)"), std::atanh(0.9)); } TEST(UnitTestEvaluator, Ngp_testFunction_atanh) @@ -2607,7 +2677,7 @@ TEST(UnitTestEvaluator, Ngp_testFunction_atanh) EXPECT_DOUBLE_EQ(device_evaluate("atanh(0)"), 0); EXPECT_DOUBLE_EQ(device_evaluate("atanh(0.1)"), std::atanh(0.1)); EXPECT_DOUBLE_EQ(device_evaluate("atanh(0.5)"), std::atanh(0.5)); - EXPECT_DOUBLE_EQ(device_evaluate("atanh(1)"), std::atanh(1)); + EXPECT_DOUBLE_EQ(device_evaluate("atanh(0.9)"), std::atanh(0.9)); } TEST(UnitTestEvaluator, testFunction_erf) diff --git a/packages/stk/stk_unit_tests/stk_io/UnitTestWriteSTKMesh.cpp b/packages/stk/stk_unit_tests/stk_io/UnitTestWriteSTKMesh.cpp index 75e180bd1c8b..7ca4388e788f 100644 --- a/packages/stk/stk_unit_tests/stk_io/UnitTestWriteSTKMesh.cpp +++ b/packages/stk/stk_unit_tests/stk_io/UnitTestWriteSTKMesh.cpp @@ -1,44 +1,42 @@ 
#include -#include "mpi.h" #include #include #include #include -#include #include #include #include #include #include -#include #include #include #include #include #include +#include #include #include +#include #include #include -#include #include #include #include #include -#include +#include #include #include #include +#include #include #include // for unlink -#include "stk_util/environment/Env.hpp" #include namespace { @@ -62,24 +60,83 @@ Ioss::DatabaseIO* create_output_db_io(const std::string &filename) return db_io; } -//BeginDocTest1 -TEST(StkIo, write_stk_mesh_to_file) +void verify_num_nodes_in_file(MPI_Comm comm, + const std::string& meshFileName, + unsigned expectedNumNodes) +{ + std::shared_ptr bulkData = build_mesh(comm); + stk::io::fill_mesh(meshFileName, *bulkData); + + std::vector entity_counts; + stk::mesh::comm_mesh_counts(*bulkData, entity_counts); + EXPECT_EQ(expectedNumNodes, entity_counts[stk::topology::NODE_RANK]); +} + +void fill_node_ids_and_coords(const stk::mesh::BulkData& bulk, + std::vector& node_ids, + std::vector& coordinates) +{ + const stk::mesh::MetaData& meta = bulk.mesh_meta_data(); + stk::mesh::Field * coordField = meta.get_field(stk::topology::NODE_RANK, "coordinates"); + int spatial_dim = meta.spatial_dimension(); + + STK_ThrowAssert(coordField != NULL); + + stk::mesh::Selector locallyOwned = meta.locally_owned_part(); + + int node_counter = 0; + stk::mesh::for_each_entity_run_no_threads(bulk, stk::topology::NODE_RANK, locallyOwned, + [&](const stk::mesh::BulkData& mesh, stk::mesh::Entity node) + { + int node_id = mesh.identifier(node); + node_ids[node_counter] = node_id; + + const double* coords = stk::mesh::field_data(*coordField, node); + for(int k=0;k& elem_ids, + std::vector& connectivity) +{ + stk::mesh::EntityVector elems; + stk::mesh::get_entities(bulkData, stk::topology::ELEM_RANK, *elemBlock, elems); + + elem_ids.resize(elems.size()); + const unsigned connectivity_size = elems.size()*elemBlock->topology().num_nodes(); 
+ connectivity.resize(connectivity_size); + + unsigned conn_counter = 0; + for(size_t j=0;j bulkData = build_mesh(comm); stk::mesh::MetaData& meta = bulkData->mesh_meta_data(); stk::io::fill_mesh("generated:2x2x2|sideset:xX|nodeset:x", *bulkData); - const stk::mesh::PartVector & all_parts = meta.get_parts(); - Ioss::DatabaseIO* db_io = create_output_db_io(file_written); - Ioss::Region output_region(db_io); EXPECT_TRUE(db_io->ok()); + Ioss::Region output_region(db_io); //////////////////////////////////////////////////////////// @@ -96,26 +153,19 @@ TEST(StkIo, write_stk_mesh_to_file) Ioss::NodeBlock *output_node_block = new Ioss::NodeBlock(db_io, NodeBlockName, num_nodes, spatial_dim); output_region.add(output_node_block); - for(stk::mesh::PartVector::const_iterator i = all_parts.begin(); i != all_parts.end(); ++i) - { - stk::mesh::Part * const part = *i; - - if(stk::io::is_part_io_part(*part)) // this means it is an io_part - { - if(part->primary_entity_rank() == stk::topology::ELEMENT_RANK) - { - stk::mesh::EntityVector entities; - const stk::mesh::BucketVector &input_buckets = bulkData->buckets(stk::topology::ELEMENT_RANK); - stk::mesh::get_selected_entities(*part, input_buckets, entities); - Ioss::ElementBlock *output_element_block = new Ioss::ElementBlock(db_io, part->name(), part->topology().name(), entities.size()); - - output_element_block->property_add(Ioss::Property("original_topology_type", part->topology().name())); - output_element_block->property_add(Ioss::Property("id", part->id())); - output_region.add(output_element_block); - - // how about attributes? 
- } - } + stk::mesh::PartVector elemBlockParts; + stk::mesh::fill_element_block_parts(meta, stk::topology::HEX_8, elemBlockParts); + + for(const stk::mesh::Part* elemBlock : elemBlockParts) { + STK_ThrowRequireMsg(stk::io::is_part_io_part(*elemBlock),"element-block-part "<name()<<" is not an IO part."); + unsigned numElems = stk::mesh::count_entities(*bulkData, stk::topology::ELEM_RANK, *elemBlock); + Ioss::ElementBlock *output_element_block = new Ioss::ElementBlock(db_io, elemBlock->name(), elemBlock->topology().name(), numElems); + + output_element_block->property_add(Ioss::Property("original_topology_type", elemBlock->topology().name())); + output_element_block->property_add(Ioss::Property("id", elemBlock->id())); + output_region.add(output_element_block); + + // how about attributes? } output_region.end_mode(Ioss::STATE_DEFINE_MODEL); @@ -126,94 +176,32 @@ TEST(StkIo, write_stk_mesh_to_file) Ioss::NodeBlock *node_block = output_region.get_node_blocks()[0]; - stk::mesh::Field * coordField = meta.get_field(stk::topology::NODE_RANK, "coordinates"); - - ASSERT_TRUE(coordField != NULL); - std::vector coordinates(spatial_dim*num_nodes); std::vector node_ids(num_nodes); - stk::mesh::Selector local_nodes = meta.locally_owned_part(); - - const stk::mesh::BucketVector &input_buckets = bulkData->get_buckets(stk::topology::NODE_RANK, local_nodes); - - int node_counter = 0; - for(size_t i=0;iidentifier(node); - node_ids[node_counter] = node_id; - - double* coords = stk::mesh::field_data(*coordField, node); - for(int k=0;kput_field_data("mesh_model_coordinates", coordinates); node_block->put_field_data("ids", node_ids); - for(stk::mesh::PartVector::const_iterator i = all_parts.begin(); i != all_parts.end(); ++i) - { - stk::mesh::Part * const part = *i; - - if(stk::io::is_part_io_part(*part)) // this means it is an io_part - { - if(part->primary_entity_rank() == stk::topology::ELEMENT_RANK) - { - stk::mesh::EntityVector entities; - const stk::mesh::BucketVector 
&input_bucketsA = bulkData->buckets(stk::topology::ELEMENT_RANK); - stk::mesh::get_selected_entities(*part, input_bucketsA, entities); - Ioss::ElementBlock *output_element_block = output_region.get_element_block(part->id()); - - std::vector elem_ids(entities.size()); - unsigned connectivity_size = entities.size()*part->topology().num_nodes(); - std::vector connectivity(connectivity_size); - unsigned conn_counter = 0; - - for(size_t j=0;jidentifier(entities[j]); - unsigned num_nodes_per = bulkData->num_nodes(entities[j]); - const stk::mesh::Entity *nodes = bulkData->begin_nodes(entities[j]); - for(unsigned k=0;kidentifier(nodes[k]); - conn_counter++; - } - } - - output_element_block->put_field_data("connectivity_raw", connectivity); - output_element_block->put_field_data("ids", elem_ids); - } - } + for(const stk::mesh::Part* elemBlock : elemBlockParts) { + std::vector elem_ids; + std::vector connectivity; + + fill_elem_ids_and_connectivity(*bulkData, elemBlock, elem_ids, connectivity); + + Ioss::ElementBlock *output_element_block = output_region.get_element_block(elemBlock->id()); + output_element_block->put_field_data("connectivity_raw", connectivity); + output_element_block->put_field_data("ids", elem_ids); } output_region.end_mode(Ioss::STATE_MODEL); //////////////////////////////////////////////////////////// } - if(stk::parallel_machine_size(comm) == 1) - { - std::shared_ptr bulkData = build_mesh(comm); - - stk::io::fill_mesh(file_written, *bulkData); - - std::vector entity_counts; - stk::mesh::comm_mesh_counts(*bulkData, entity_counts); - EXPECT_EQ(27u, entity_counts[stk::topology::NODE_RANK]); - } - + verify_num_nodes_in_file(comm, file_written, 27); unlink(file_written.c_str()); - } -//EndDocTest1 class StkIoResultsOutput : public stk::unit_test_util::MeshFixture { diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestBulkData.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestBulkData.cpp index 95de942fa434..f384a8949659 100644 --- 
a/packages/stk/stk_unit_tests/stk_mesh/UnitTestBulkData.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestBulkData.cpp @@ -5815,8 +5815,7 @@ TEST(FaceCreation, test_face_creation_2Hexes_2procs) stk::mesh::MetaData meta(3); stk::unit_test_util::BulkDataFaceSharingTester mesh(meta, MPI_COMM_WORLD); - const std::string generatedMeshSpec = "generated:1x1x2"; - stk::io::fill_mesh(generatedMeshSpec, mesh); + stk::mesh::fixtures::HexFixture::fill_mesh(1,1,2, mesh); int procId = stk::parallel_machine_rank(MPI_COMM_WORLD); @@ -6018,8 +6017,7 @@ TEST(ChangeEntityId, test_throw_on_shared_node) std::shared_ptr bulkPtr = stk::unit_test_util::build_mesh(spatialDim, MPI_COMM_WORLD); stk::mesh::BulkData& mesh = *bulkPtr; - const std::string generatedMeshSpec = "generated:1x1x2"; - stk::io::fill_mesh(generatedMeshSpec, mesh); + stk::mesh::fixtures::HexFixture::fill_mesh(1,1,2, mesh); stk::mesh::Entity sharedNode5 = mesh.get_entity(stk::topology::NODE_RANK, 5); @@ -6041,8 +6039,7 @@ TEST(AmbiguousTopology, hexRedefinedAsShell) stk::mesh::MetaData& meta= bulkPtr->mesh_meta_data(); stk::mesh::BulkData& mesh = *bulkPtr; - const std::string generatedMeshSpec = "generated:1x1x1"; - stk::io::fill_mesh(generatedMeshSpec, mesh); + stk::mesh::fixtures::HexFixture::fill_mesh(1,1,1, mesh); stk::mesh::Part& shellPart = meta.get_topology_root_part(stk::topology::SHELL_QUAD_4); stk::mesh::EntityId elemId = 1; diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestChangeParts.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestChangeParts.cpp index c55edaf4185c..d1b2f883f726 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestChangeParts.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestChangeParts.cpp @@ -96,14 +96,78 @@ TEST(UnitTestChangeParts, test_throw_on_internal_part_change) EXPECT_THROW(bulkData.change_entity_parts(node, addParts, removeParts), std::runtime_error); } +void do_simple_batch_part_change_test(stk::mesh::BulkData& bulkData) +{ + stk::mesh::MetaData& metaData 
= bulkData.mesh_meta_data(); + stk::mesh::Part& bluePart = metaData.declare_part("blue_part"); + stk::mesh::Part& redPart = metaData.declare_part("red_part"); + + stk::mesh::Entity elem1 = bulkData.get_entity(stk::topology::ELEM_RANK, 1u); + EXPECT_TRUE(bulkData.is_valid(elem1)); + + stk::mesh::EntityVector nodes(bulkData.begin_nodes(elem1), bulkData.begin_nodes(elem1)+bulkData.num_nodes(elem1)); + EXPECT_EQ(8u, nodes.size()); + + for(stk::mesh::Entity node : nodes) { + EXPECT_FALSE(bulkData.bucket(node).member(bluePart)); + EXPECT_FALSE(bulkData.bucket(node).member(redPart)); + } + + stk::mesh::PartVector blue_parts(1, &bluePart); + stk::mesh::PartVector red_parts(1, &redPart); + bulkData.batch_change_entity_parts(nodes, blue_parts, {}); + nodes.resize(4); + bulkData.batch_change_entity_parts(nodes, red_parts, blue_parts); + + for(stk::mesh::Entity node : nodes) { + EXPECT_TRUE(bulkData.bucket(node).member(redPart)); + EXPECT_FALSE(bulkData.bucket(node).member(bluePart)); + } + + nodes.resize(1); + bulkData.batch_change_entity_parts(nodes, blue_parts, red_parts); + + EXPECT_TRUE(bulkData.bucket(nodes[0]).member(bluePart)); + EXPECT_FALSE(bulkData.bucket(nodes[0]).member(redPart)); +} + +TEST(UnitTestChangeParts, genmesh_test_batch_part_change_1_node) +{ + stk::ParallelMachine pm = MPI_COMM_WORLD; + const int p_size = stk::parallel_machine_size( pm ); + + if (p_size != 1) { GTEST_SKIP(); } + + const int spatialDim = 3; + std::shared_ptr bulkPtr = build_mesh(spatialDim, pm, stk::mesh::BulkData::NO_AUTO_AURA); + + stk::io::fill_mesh("generated:1x1x1", *bulkPtr); + + do_simple_batch_part_change_test(*bulkPtr); +} + +TEST(UnitTestChangeParts, test_batch_part_change_1_node) +{ + stk::ParallelMachine pm = MPI_COMM_WORLD; + const int p_size = stk::parallel_machine_size( pm ); + + if (p_size != 1) { GTEST_SKIP(); } + + const int spatialDim = 3; + std::shared_ptr bulkPtr = build_mesh(spatialDim, pm, stk::mesh::BulkData::NO_AUTO_AURA); + + std::string meshDesc = 
"0,1,HEX_8,1,2,3,4,5,6,7,8,block_1"; + stk::unit_test_util::setup_text_mesh(*bulkPtr, meshDesc); + + do_simple_batch_part_change_test(*bulkPtr); +} + TEST(UnitTestChangeParts, test_batch_part_change) { stk::ParallelMachine pm = MPI_COMM_WORLD; const int p_size = stk::parallel_machine_size( pm ); - if (p_size != 1) { - return; - } + if (p_size != 1) { GTEST_SKIP(); } const int spatialDim = 3; std::shared_ptr bulkPtr = build_mesh(spatialDim, pm, stk::mesh::BulkData::NO_AUTO_AURA); diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestCreateFaces.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestCreateFaces.cpp index 9f446199215a..28241beec212 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestCreateFaces.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestCreateFaces.cpp @@ -113,14 +113,12 @@ TEST ( UnitTestCreateFaces, Hex_2x1x1 ) const size_t NY = 1; const size_t NZ = 1; - stk::mesh::fixtures::HexFixture fixture( MPI_COMM_WORLD, NX, NY, NZ); - - fixture.m_meta.commit(); - fixture.generate_mesh(); + std::shared_ptr bulkPtr = build_mesh(MPI_COMM_WORLD); + stk::mesh::fixtures::HexFixture::fill_mesh(NX, NY, NZ, *bulkPtr); { std::vector counts ; - stk::mesh::comm_mesh_counts( fixture.m_bulk_data , counts); + stk::mesh::comm_mesh_counts( *bulkPtr , counts); EXPECT_EQ( exp_node_count(NX, NY, NZ), counts[node_rank] ); // nodes EXPECT_EQ( 0u, counts[edge_rank] ); // edges @@ -128,11 +126,11 @@ TEST ( UnitTestCreateFaces, Hex_2x1x1 ) EXPECT_EQ( exp_hex_count(NX, NY, NZ), counts[elem_rank] ); // elements } - stk::mesh::create_faces(fixture.m_bulk_data); + stk::mesh::create_faces(*bulkPtr); { std::vector counts ; - stk::mesh::comm_mesh_counts( fixture.m_bulk_data , counts); + stk::mesh::comm_mesh_counts( *bulkPtr , counts); EXPECT_EQ( exp_node_count(NX, NY, NZ), counts[node_rank] ); // nodes EXPECT_EQ( 0u , counts[edge_rank] ); // edges diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestDestroyElements.cpp 
b/packages/stk/stk_unit_tests/stk_mesh/UnitTestDestroyElements.cpp index b9528d885cff..2172852b1a7f 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestDestroyElements.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestDestroyElements.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -660,7 +661,7 @@ TEST(DestroyElements, destroyAll) builder.set_maximum_bucket_capacity(2); std::shared_ptr bulkPtr = builder.create(); - stk::io::fill_mesh("generated:1x1x4", *bulkPtr); + stk::mesh::fixtures::HexFixture::fill_mesh(1,1,4, *bulkPtr, "block_1"); stk::mesh::Part* block1 = bulkPtr->mesh_meta_data().get_part("block_1"); stk::mesh::Part& block2 = bulkPtr->mesh_meta_data().declare_part("block_2", stk::topology::ELEM_RANK); stk::mesh::EntityVector elemsToMove = { diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp index 1dced60a826b..979e9725ab88 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp @@ -79,6 +79,7 @@ namespace Ioss { class DatabaseIO; } namespace { const stk::topology::rank_t NODE_RANK = stk::topology::NODE_RANK; +using ngp_unit_test_utils::check_bucket_layout; using stk::unit_test_util::build_mesh; TEST(UnitTestField, testFieldMaxSize) @@ -1562,7 +1563,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity1) check_num_buckets(*m_bulk, 1); check_bucket_sizes(*m_bulk, {1}); check_bucket_capacities(*m_bulk, {1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1}}}, bucketRank); } { SCOPED_TRACE("Create Node 2"); @@ -1571,7 +1572,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity1) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {1, 1}); check_bucket_capacities(*m_bulk, {1, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, 
{{"block_1", {1}}, {"block_1", {2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1}}, {{"block_1"}, {2}}}, bucketRank); } { SCOPED_TRACE("Create Node 3"); @@ -1580,7 +1581,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity1) check_num_buckets(*m_bulk, 3); check_bucket_sizes(*m_bulk, {1, 1, 1}); check_bucket_capacities(*m_bulk, {1, 1, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1}}, {"block_1", {2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1}}, {{"block_1"}, {2}}, {{"block_1"}, {3}}}, bucketRank); } } @@ -1599,7 +1600,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 1); check_bucket_sizes(*m_bulk, {1}); check_bucket_capacities(*m_bulk, {2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1}}}, bucketRank); } { SCOPED_TRACE("Create Node 2"); @@ -1608,7 +1609,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 1); check_bucket_sizes(*m_bulk, {2}); check_bucket_capacities(*m_bulk, {2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}}, bucketRank); } { SCOPED_TRACE("Create Node 3"); @@ -1617,7 +1618,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } } @@ -1636,7 +1637,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 1); check_bucket_sizes(*m_bulk, {1}); check_bucket_capacities(*m_bulk, 
{1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1}}}, bucketRank); } { SCOPED_TRACE("Create Node 2"); @@ -1645,7 +1646,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 1); check_bucket_sizes(*m_bulk, {2}); check_bucket_capacities(*m_bulk, {2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}}, bucketRank); } { SCOPED_TRACE("Create Node 3"); @@ -1654,7 +1655,7 @@ TEST_F(VariableCapacityBuckets, createNodes_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } } @@ -1676,7 +1677,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } { SCOPED_TRACE("Change parts for Node 1"); @@ -1685,7 +1686,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {2, 3}}, {"block_2", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {2, 3}}, {{"block_2"}, {1}}}, bucketRank); } { SCOPED_TRACE("Change parts for Node 2"); @@ -1694,7 +1695,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity2_maxCapacity2) 
check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {1, 2}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {3}}, {"block_2", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {3}}, {{"block_2"}, {1, 2}}}, bucketRank); } { SCOPED_TRACE("Change parts for Node 3"); @@ -1703,7 +1704,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_2", {1, 2}}, {"block_2", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_2"}, {1, 2}}, {{"block_2"}, {3}}}, bucketRank); } } @@ -1725,7 +1726,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } { @@ -1735,7 +1736,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {2, 3}}, {"block_2", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {2, 3}}, {{"block_2"}, {1}}}, bucketRank); } { @@ -1745,7 +1746,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {1, 2}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {3}}, {"block_2", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {3}}, {{"block_2"}, {1, 2}}}, bucketRank); } { @@ 
-1755,7 +1756,7 @@ TEST_F(VariableCapacityBuckets, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_2", {1, 2}}, {"block_2", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_2"}, {1, 2}}, {{"block_2"}, {3}}}, bucketRank); } } @@ -1772,12 +1773,12 @@ TEST_F(VariableCapacityBuckets, initialMeshConstruction_initialCapacity2_maxCapa stk::mesh::EntityVector newNodes; m_bulk->modification_begin(); m_bulk->declare_entities(stk::topology::NODE_RANK, ids, stk::mesh::PartVector{&block1}, newNodes); + m_bulk->modification_end(); check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); - m_bulk->modification_end(); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } } @@ -1794,12 +1795,12 @@ TEST_F(VariableCapacityBuckets, initialMeshConstruction_initialCapacity1_maxCapa stk::mesh::EntityVector newNodes; m_bulk->modification_begin(); m_bulk->declare_entities(stk::topology::NODE_RANK, ids, stk::mesh::PartVector{&block1}, newNodes); + m_bulk->modification_end(); check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); - m_bulk->modification_end(); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); } } @@ -1860,7 +1861,7 @@ class VariableCapacityFieldData : public ::testing::TestWithParam(numRanks, alignment); } else if (GetParam() == FieldDataManagerType::CONTIGUOUS) { - const unsigned extraCapacity = 0; + const unsigned extraCapacity = 4; // Room for 1 extra int, for occasional in-place sorting const unsigned 
alignment = 4; fieldDataManager = std::make_unique(extraCapacity, alignment); } @@ -2203,7 +2204,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2215,7 +2216,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {2, 3}}, {"block_2", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {2, 3}}, {{"block_2"}, {1}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2227,7 +2228,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {1, 2}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {3}}, {"block_2", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {3}}, {{"block_2"}, {1, 2}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2238,7 +2239,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity2_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_2", {1, 2}}, {"block_2", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_2"}, {1, 2}}, {{"block_2"}, {3}}}, bucketRank); 
check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2265,7 +2266,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {1, 2}}, {"block_1", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {1, 2}}, {{"block_1"}, {3}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2277,7 +2278,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {2, 3}}, {"block_2", {1}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {2, 3}}, {{"block_2"}, {1}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2289,7 +2290,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {1, 2}); check_bucket_capacities(*m_bulk, {2, 2}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_1", {3}}, {"block_2", {1, 2}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_1"}, {3}}, {{"block_2"}, {1, 2}}}, bucketRank); check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2300,7 +2301,7 @@ TEST_P(VariableCapacityFieldData, changeNodeParts_initialCapacity1_maxCapacity2) check_num_buckets(*m_bulk, 2); check_bucket_sizes(*m_bulk, {2, 1}); check_bucket_capacities(*m_bulk, {2, 1}); - ngp_unit_test_utils::check_bucket_layout(*m_bulk, {{"block_2", {1, 2}}, {"block_2", {3}}}, bucketRank); + check_bucket_layout(*m_bulk, {{{"block_2"}, {1, 2}}, {{"block_2"}, {3}}}, bucketRank); 
check_expected_bytes_allocated(*m_bulk, field); check_field_values(*m_bulk, field); } @@ -2325,10 +2326,10 @@ TEST_P(VariableCapacityFieldData, initialMeshConstruction_initialCapacity2_maxCa m_bulk->modification_begin(); m_bulk->declare_entities(stk::topology::NODE_RANK, ids, stk::mesh::PartVector{&block1}, newNodes); m_bulk->allocate_field_data(); + m_bulk->modification_end(); check_num_buckets(*m_bulk, 2); check_expected_bytes_allocated(*m_bulk, field); - m_bulk->modification_end(); } } @@ -2351,10 +2352,10 @@ TEST_P(VariableCapacityFieldData, initialMeshConstruction_initialCapacity1_maxCa m_bulk->modification_begin(); m_bulk->declare_entities(stk::topology::NODE_RANK, ids, stk::mesh::PartVector{&block1}, newNodes); m_bulk->allocate_field_data(); + m_bulk->modification_end(); check_num_buckets(*m_bulk, 2); check_expected_bytes_allocated(*m_bulk, field); - m_bulk->modification_end(); } } diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestPartitions.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestPartitions.cpp index 1569f62adc60..065836bb136d 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestPartitions.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestPartitions.cpp @@ -270,7 +270,7 @@ void check_test_partition_invariant(SelectorFixture& fix, EXPECT_TRUE(check_nonempty_strictly_ordered(field_data, bkt.size())); } const unsigned *bucket_key = bkt.key(); - for (size_t k = 0; k < partition_key.size() - 1; ++k) + for (size_t k = 0; k < partition_key.size(); ++k) { EXPECT_EQ(partition_key[k], bucket_key[k]); } diff --git a/packages/stk/stk_unit_tests/stk_mesh/face_creation/skin_mesh/UnitTestSkinMeshRefined.cpp b/packages/stk/stk_unit_tests/stk_mesh/face_creation/skin_mesh/UnitTestSkinMeshRefined.cpp index f24f4d1c49d7..980c9b1540d5 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/face_creation/skin_mesh/UnitTestSkinMeshRefined.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/face_creation/skin_mesh/UnitTestSkinMeshRefined.cpp @@ -22,7 +22,6 @@ 
TEST(ElementGraph, RefinedQuad) stk::mesh::Part &quad_part = meta.declare_part_with_topology("Quads", stk::topology::QUADRILATERAL_4_2D); stk::mesh::Part &skin = meta.declare_part_with_topology("Edges", stk::topology::LINE_2); stk::io::put_io_part_attribute(skin); - stk::mesh::PartVector skin_parts = {&skin}; stk::mesh::Part &active = meta.declare_part("active"); stk::mesh::Field & node_coord = meta.declare_field(stk::topology::NODE_RANK, "coordinates"); diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldTestUtils.hpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldTestUtils.hpp index 17768ef7edfc..7c52397c85cc 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldTestUtils.hpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldTestUtils.hpp @@ -7,6 +7,7 @@ #include #include #include +#include "stk_mesh/base/NgpForEachEntity.hpp" namespace ngp_field_test_utils { diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpMeshTest.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpMeshTest.cpp index b95d67c4c016..e96770e3cbe3 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpMeshTest.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpMeshTest.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -278,3 +279,43 @@ TEST(NgpHostMesh, FieldForEachEntityReduceOnHost_fromTylerVoskuilen) EXPECT_EQ(1.0, maxZ); } +void add_elements(std::unique_ptr& bulk) +{ + stk::mesh::MetaData& meta = bulk->mesh_meta_data(); + stk::mesh::Part& part_1 = meta.declare_part_with_topology("part_1", stk::topology::HEX_8); + + const int rank = stk::parallel_machine_rank(MPI_COMM_WORLD); + const stk::mesh::EntityId elemId = rank + 1; + const stk::mesh::EntityId firstNodeId = rank * 8 + 1; + + stk::mesh::EntityIdVector nodeIds(8, 0); + for (unsigned i = 0; i < nodeIds.size(); ++i) { + nodeIds[i] = firstNodeId + i; + } + + bulk->modification_begin(); + stk::mesh::declare_element(*bulk, part_1, elemId, nodeIds); + 
bulk->modification_end(); +} + +TEST(NgpTeardownOrdering, BulkDataOutlivesNgpMesh) +{ + std::unique_ptr bulk = stk::mesh::MeshBuilder(MPI_COMM_WORLD).set_spatial_dimension(3).create(); + add_elements(bulk); + + [[maybe_unused]] stk::mesh::NgpMesh ngpMesh = stk::mesh::get_updated_ngp_mesh(*bulk); + + // The "expect" for this test is a clean Valgrind run and no seg-faults +} + +TEST(NgpTeardownOrdering, NgpMeshOutlivesBulkData) +{ + stk::mesh::NgpMesh ngpMesh; + std::unique_ptr bulk = stk::mesh::MeshBuilder(MPI_COMM_WORLD).set_spatial_dimension(3).create(); + add_elements(bulk); + + ngpMesh = stk::mesh::get_updated_ngp_mesh(*bulk); + + // The "expect" for this test is a clean Valgrind run and no seg-faults +} + diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpParallelSumTest.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpParallelSumTest.cpp index 2562c7f3aca1..ee0e5667a0d0 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpParallelSumTest.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpParallelSumTest.cpp @@ -595,7 +595,7 @@ NGP_TEST_F(NgpCommunicateFieldData, simpleVersion_takesBulkData_noSyncToDeviceAf check_field_on_device(ngpMesh, deviceUserField, deviceGoldValues); } -NGP_TEST_F(NgpParallelSum, DISABLED_DeviceMPIVersion) +NGP_TEST_F(NgpParallelSum, DeviceMPIVersion) { if (!stk::have_device_aware_mpi()) { GTEST_SKIP(); } diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp index 1b623df4984b..4d7b9e514485 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp @@ -16,9 +16,11 @@ namespace ngp_unit_test_utils { +constexpr unsigned MaxNumParts = 4; + struct BucketContents { - std::string partName; + std::vector partNames; std::vector entities; }; @@ -63,9 +65,9 @@ inline void setup_mesh_2hex_2block(stk::mesh::BulkData& bulk, unsigned bucketCap 
stk::unit_test_util::setup_text_mesh(bulk, meshDesc); } -struct CheckPartMembership { +struct CheckBucketParts { using BucketPartOrdinalType = Kokkos::View; - CheckPartMembership( + CheckBucketParts( const stk::mesh::NgpMesh& _ngpMesh, BucketPartOrdinalType _bucketPartOrdinals, size_t _numBuckets, const stk::topology::rank_t _bucketRank) : ngpMesh(_ngpMesh), @@ -78,8 +80,13 @@ struct CheckPartMembership { KOKKOS_FUNCTION void operator()(size_t) const { - for (unsigned i = 0; i < numBuckets; ++i) { - NGP_EXPECT_TRUE(ngpMesh.get_bucket(bucketRank, i).member(bucketPartOrdinals[i])); + for (unsigned bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) { + for (unsigned partIdx = 0; partIdx < MaxNumParts; ++partIdx) { + unsigned partOffset = bucketIdx*MaxNumParts + partIdx; + if (bucketPartOrdinals[partOffset] != stk::mesh::InvalidOrdinal) { + NGP_EXPECT_TRUE(ngpMesh.get_bucket(bucketRank, bucketIdx).member(bucketPartOrdinals[partOffset])); + } + } } } @@ -90,42 +97,59 @@ struct CheckPartMembership { stk::topology::rank_t bucketRank; }; -inline void check_bucket_layout(const stk::mesh::BulkData& bulk, const std::vector & expectedBucketLayout, +inline void check_bucket_layout(const stk::mesh::BulkData& bulk, + const std::vector & expectedBucketLayout, const stk::topology::rank_t bucketRank = stk::topology::ELEM_RANK) { const stk::mesh::MetaData& meta = bulk.mesh_meta_data(); const stk::mesh::BucketVector & buckets = bulk.buckets(bucketRank); size_t numBuckets = buckets.size(); - ASSERT_EQ(numBuckets, expectedBucketLayout.size()); + ASSERT_EQ(numBuckets, expectedBucketLayout.size()) << "Found " << numBuckets << " Host Buckets when expecting " + << expectedBucketLayout.size(); size_t numEntitiesAcrossBuckets = 0; for (size_t bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) { const stk::mesh::Bucket & bucket = *buckets[bucketIdx]; const BucketContents & bucketContents = expectedBucketLayout[bucketIdx]; - const stk::mesh::Part & expectedPart = 
*meta.get_part(bucketContents.partName); - EXPECT_TRUE(bucket.member(expectedPart)); + for (const std::string& partName : bucketContents.partNames) { + const stk::mesh::Part& expectedPart = *meta.get_part(partName); + EXPECT_TRUE(bucket.member(expectedPart)) << "Host Bucket " << bucket.bucket_id() << " not a member of Part " + << expectedPart.name(); + } numEntitiesAcrossBuckets += bucket.size(); - ASSERT_EQ(bucket.size(), bucketContents.entities.size()); + ASSERT_EQ(bucket.size(), bucketContents.entities.size()) << "Found " << bucket.size() + << " Entities in Host Bucket when expecting " + << bucketContents.entities.size(); for (unsigned i = 0; i < bucket.size(); ++i) { - EXPECT_EQ(bulk.identifier(bucket[i]), bucketContents.entities[i]); + EXPECT_EQ(bulk.identifier(bucket[i]), bucketContents.entities[i]) << "Found " << bucket[i] + << " in Host Bucket when expecting " + << bucketContents.entities[i]; } } using BucketPartOrdinalType = Kokkos::View; - BucketPartOrdinalType bucketPartOrdinals("bucketPartOrdinals", numBuckets); + BucketPartOrdinalType bucketPartOrdinals("bucketPartOrdinals", numBuckets*MaxNumParts); BucketPartOrdinalType::HostMirror hostBucketPartOrdinals = Kokkos::create_mirror_view(bucketPartOrdinals); - for (size_t i = 0; i < buckets.size(); ++i) { - hostBucketPartOrdinals[i] = meta.get_part(expectedBucketLayout[i].partName)->mesh_meta_data_ordinal(); + Kokkos::deep_copy(hostBucketPartOrdinals, stk::mesh::InvalidOrdinal); + for (size_t bucketIdx = 0; bucketIdx < buckets.size(); ++bucketIdx) { + const unsigned numExpectedParts = expectedBucketLayout[bucketIdx].partNames.size(); + STK_ThrowRequireMsg(numExpectedParts <= MaxNumParts, "Checking more Parts than test fixture supports"); + for (size_t partIdx = 0; partIdx < numExpectedParts; ++partIdx) { + const std::string& partName = expectedBucketLayout[bucketIdx].partNames[partIdx]; + unsigned partOffset = bucketIdx*MaxNumParts + partIdx; + hostBucketPartOrdinals[partOffset] = 
meta.get_part(partName)->mesh_meta_data_ordinal(); + } } Kokkos::deep_copy(bucketPartOrdinals, hostBucketPartOrdinals); stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(bulk); - CheckPartMembership checkElementMembership(ngpMesh, bucketPartOrdinals, numBuckets, bucketRank); - Kokkos::parallel_for(stk::ngp::DeviceRangePolicy(0, 1), checkElementMembership); + CheckBucketParts checkBucketParts(ngpMesh, bucketPartOrdinals, numBuckets, bucketRank); + Kokkos::parallel_for(stk::ngp::DeviceRangePolicy(0, 1), checkBucketParts); - ASSERT_EQ(ngpMesh.num_buckets(bucketRank), numBuckets); + ASSERT_EQ(ngpMesh.num_buckets(bucketRank), numBuckets) << "Found " << ngpMesh.num_buckets(bucketRank) + << " Device Buckets when expecting " << numBuckets; for (unsigned bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) { const stk::mesh::NgpMesh::BucketType & ngpBucket = ngpMesh.get_bucket(bucketRank, bucketIdx); @@ -133,7 +157,7 @@ inline void check_bucket_layout(const stk::mesh::BulkData& bulk, const std::vec ASSERT_EQ(bucket.size(), ngpBucket.size()); } - using BucketEntitiesType = Kokkos::View; + using BucketEntitiesType = Kokkos::View; BucketEntitiesType bucketEntities("bucketEntities", numEntitiesAcrossBuckets); BucketEntitiesType::HostMirror hostBucketEntities = Kokkos::create_mirror_view(bucketEntities); @@ -143,7 +167,7 @@ inline void check_bucket_layout(const stk::mesh::BulkData& bulk, const std::vec for (unsigned bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) { const stk::mesh::NgpMesh::BucketType & bucket = ngpMesh.get_bucket(bucketRank, bucketIdx); for (size_t i = 0; i < bucket.size(); ++i) { - bucketEntities[idx++] = ngpMesh.identifier(bucket[i]); + bucketEntities[idx++] = bucket[i]; } } }); @@ -154,7 +178,9 @@ inline void check_bucket_layout(const stk::mesh::BulkData& bulk, const std::vec for (size_t bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) { const stk::mesh::Bucket & bucket = *buckets[bucketIdx]; for (unsigned i = 0; i < bucket.size(); ++i) { 
- EXPECT_EQ(bulk.identifier(bucket[i]), hostBucketEntities[index++]); + const stk::mesh::Entity deviceEntity = hostBucketEntities[index++]; + EXPECT_EQ(bucket[i], deviceEntity) << "Found " << deviceEntity << " in Device Bucket when expecting " + << bucket[i]; } } } diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/TestNgpMeshUpdate.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/TestNgpMeshUpdate.cpp index 2dd63a22b895..8880e9de6a1b 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/TestNgpMeshUpdate.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/TestNgpMeshUpdate.cpp @@ -12,6 +12,7 @@ namespace { +using ngp_unit_test_utils::check_bucket_layout; using NgpMeshDefaultMemSpace = stk::mesh::NgpMeshDefaultMemSpace; class UpdateNgpMesh : public stk::unit_test_util::MeshFixture @@ -185,7 +186,7 @@ TEST_F(BucketLayoutModification, DeleteBucketInMiddle) stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_1")}; @@ -195,7 +196,7 @@ TEST_F(BucketLayoutModification, DeleteBucketInMiddle) ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1,2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1,2}}, {{"block_3"}, {3}}}); } // ------------------------- ------------------------- @@ -213,7 +214,7 @@ TEST_F(BucketLayoutModification, AddBucketInMiddle) stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector 
addParts{get_meta().get_part("block_1")}; @@ -223,7 +224,7 @@ TEST_F(BucketLayoutModification, AddBucketInMiddle) ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_1", {3}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_1"}, {3}}, {{"block_2"}, {2}}}); } // ------------------------- ------------------------- @@ -241,7 +242,7 @@ TEST_F(BucketLayoutModification, ChangeBucketContents) stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1,2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1,2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_3")}; @@ -251,7 +252,7 @@ TEST_F(BucketLayoutModification, ChangeBucketContents) ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_3", {2,3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_3"}, {2,3}}}); } // ------------------------- ------------------------- @@ -269,7 +270,7 @@ TEST_F(BucketLayoutModification, DeleteBucketInMiddle_WithCopy) stk::mesh::NgpMesh ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_1")}; @@ -279,7 +280,7 @@ TEST_F(BucketLayoutModification, DeleteBucketInMiddle_WithCopy) stk::mesh::get_updated_ngp_mesh(get_bulk()); // Trigger an update - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1,2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1,2}}, {{"block_3"}, {3}}}); } // ------------------------- 
------------------------- @@ -297,7 +298,7 @@ TEST_F(BucketLayoutModification, AddBucketInMiddle_WithCopy) stk::mesh::NgpMesh ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_1")}; @@ -307,7 +308,7 @@ TEST_F(BucketLayoutModification, AddBucketInMiddle_WithCopy) stk::mesh::get_updated_ngp_mesh(get_bulk()); // Trigger an update - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_1", {3}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_1"}, {3}}, {{"block_2"}, {2}}}); } // ------------------------- ------------------------- @@ -325,7 +326,7 @@ TEST_F(BucketLayoutModification, ChangeBucketContents_WithCopy) stk::mesh::NgpMesh ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1,2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1,2}}, {{"block_3"}, {3}}}); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_3")}; @@ -335,7 +336,7 @@ TEST_F(BucketLayoutModification, ChangeBucketContents_WithCopy) stk::mesh::get_updated_ngp_mesh(get_bulk()); // Trigger an update - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_3", {2,3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_3"}, {2,3}}}); } -} +} // namespace diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgp.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgp.cpp index 0472a7af4b76..4c869fb317de 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgp.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgp.cpp @@ -52,6 +52,16 @@ void 
test_view_of_fields(const stk::mesh::BulkData& bulk, EXPECT_EQ(1, result.h_view(0)); EXPECT_EQ(1, result.h_view(1)); + +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + for (unsigned i = 0; i < 2; ++i) { +#ifdef STK_USE_DEVICE_MESH // Compiler can't resolve destructor type through NgpField using statement + fields(i).~DeviceField(); +#else + fields(i).~HostField(); +#endif + } +#endif } TEST(UnitTestNgp, viewOfFields) diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpMeshModification.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpMeshModification.cpp new file mode 100644 index 000000000000..7e70384d89f9 --- /dev/null +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpMeshModification.cpp @@ -0,0 +1,444 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology Engineering +// Solutions of Sandia, LLC (NTESS). Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include "ngp/NgpUnitTestUtils.hpp" +#include "stk_mesh/base/MeshBuilder.hpp" +#include "stk_mesh/base/BulkData.hpp" +#include "stk_mesh/base/MetaData.hpp" +#include "stk_mesh/base/Part.hpp" +#include "stk_mesh/base/Types.hpp" +#include "stk_mesh/base/SkinMesh.hpp" + +namespace +{ +using ngp_unit_test_utils::check_bucket_layout; + +class NgpBatchChangeEntityParts : public ::testing::Test +{ +public: + NgpBatchChangeEntityParts() + { + } + + void build_empty_mesh(unsigned initialBucketCapacity, unsigned maximumBucketCapacity) + { + stk::mesh::MeshBuilder builder(MPI_COMM_WORLD); + builder.set_spatial_dimension(3); + builder.set_initial_bucket_capacity(initialBucketCapacity); + builder.set_maximum_bucket_capacity(maximumBucketCapacity); + m_bulk = builder.create(); + m_meta = &m_bulk->mesh_meta_data(); + stk::mesh::get_updated_ngp_mesh(*m_bulk); + } + +protected: + std::unique_ptr m_bulk; + stk::mesh::MetaData * m_meta; +}; + +stk::mesh::Entity create_node(stk::mesh::BulkData& bulk, stk::mesh::EntityId nodeId, + const stk::mesh::PartVector& initialParts = stk::mesh::PartVector()) +{ + bulk.modification_begin(); + stk::mesh::Entity newNode = bulk.declare_node(nodeId, initialParts); + bulk.modification_end(); + + return newNode; +} + +template +void confirm_host_mesh_is_not_synchronized_from_device(const MeshType& ngpMesh) +{ + if constexpr (std::is_same_v) { + EXPECT_EQ(ngpMesh.need_sync_to_host(), true); + } 
+ else { + EXPECT_EQ(ngpMesh.need_sync_to_host(), false); // If host build, HostMesh can't ever be stale + } +} + +template +void confirm_host_mesh_is_synchronized_from_device(const MeshType& ngpMesh) +{ + EXPECT_EQ(ngpMesh.need_sync_to_host(), false); +} + +using DeviceEntitiesType = Kokkos::View; +using DevicePartOrdinalsType = Kokkos::View; + +using HostEntitiesType = Kokkos::View; +using HostPartOrdinalsType = Kokkos::View; + +void fill_device_views_add_remove_part_from_node(DeviceEntitiesType& entities, DevicePartOrdinalsType& addPartOrdinals, + DevicePartOrdinalsType& removePartOrdinals, stk::mesh::NgpMesh& ngpMesh, + stk::mesh::Entity node, stk::mesh::Part* addPart, + stk::mesh::Part* removePart) +{ + const stk::mesh::BulkData& bulk = ngpMesh.get_bulk_on_host(); + stk::mesh::EntityId nodeId = bulk.identifier(node); + const stk::mesh::PartOrdinal addPartOrdinal = (addPart) ? addPart->mesh_meta_data_ordinal() + : stk::mesh::InvalidPartOrdinal; + const stk::mesh::PartOrdinal removePartOrdinal = (removePart) ? 
removePart->mesh_meta_data_ordinal() + : stk::mesh::InvalidPartOrdinal; + + Kokkos::parallel_for("Fill Device Views for Part Addition", stk::ngp::DeviceRangePolicy(0, 1), + KOKKOS_LAMBDA(size_t /*index*/) { + STK_NGP_ThrowRequireMsg(ngpMesh.identifier(node) == nodeId, "Unexpected node found on device"); + entities(0) = node; + + if (addPartOrdinal != stk::mesh::InvalidPartOrdinal) { + addPartOrdinals(0) = addPartOrdinal; + } + + if (removePartOrdinal != stk::mesh::InvalidPartOrdinal) { + removePartOrdinals(0) = removePartOrdinal; + } + }); +} + + +TEST_F(NgpBatchChangeEntityParts, addPartToNode_host) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + std::vector entities {node1}; + std::vector addParts {&part2}; + std::vector removeParts; + + m_bulk->batch_change_entity_parts(entities, addParts, removeParts); + + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, addPartToNode_ngpHost) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + HostEntitiesType entities("hostEntities", 1); + HostPartOrdinalsType addPartOrdinals("hostAddParts", 1); + 
HostPartOrdinalsType removePartOrdinals("hostRemoveParts", 0); + + entities(0) = node1; + addPartOrdinals(0) = part2.mesh_meta_data_ordinal(); + + stk::mesh::HostMesh hostMesh(*m_bulk); + hostMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + hostMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, addPartToNode_ngpDevice) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + DeviceEntitiesType entities("deviceEntities", 1); + DevicePartOrdinalsType addPartOrdinals("deviceAddParts", 1); + DevicePartOrdinalsType removePartOrdinals("deviceRemoveParts", 0); + + stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(*m_bulk); + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, &part2, nullptr); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + ngpMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(ngpMesh); + + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + + +TEST_F(NgpBatchChangeEntityParts, removePartFromNode_host) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = 
m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1, &part2}); + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); + + std::vector entities {node1}; + std::vector addParts; + std::vector removeParts {&part1}; + + m_bulk->batch_change_entity_parts(entities, addParts, removeParts); + stk::mesh::get_updated_ngp_mesh(*m_bulk); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, removePartFromNode_ngpHost) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1, &part2}); + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); + + HostEntitiesType entities("hostEntities", 1); + HostPartOrdinalsType addPartOrdinals("hostAddParts", 0); + HostPartOrdinalsType removePartOrdinals("hostRemoveParts", 1); + + entities(0) = node1; + removePartOrdinals(0) = part1.mesh_meta_data_ordinal(); + + stk::mesh::HostMesh hostMesh(*m_bulk); + hostMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + hostMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, removePartFromNode_ngpDevice) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + 
build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1, &part2}); + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); + + DeviceEntitiesType entities("deviceEntities", 1); + DevicePartOrdinalsType addPartOrdinals("deviceAddParts", 0); + DevicePartOrdinalsType removePartOrdinals("deviceRemoveParts", 1); + + stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(*m_bulk); + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, nullptr, &part1); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + ngpMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(ngpMesh); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + + +TEST_F(NgpBatchChangeEntityParts, addAndRemovePartFromNode_host) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + std::vector entities {node1}; + std::vector addParts {&part2}; + std::vector removeParts {&part1}; + + m_bulk->batch_change_entity_parts(entities, addParts, removeParts); + stk::mesh::get_updated_ngp_mesh(*m_bulk); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + 
+TEST_F(NgpBatchChangeEntityParts, addAndRemovePartFromNode_ngpHost) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + HostEntitiesType entities("hostEntities", 1); + HostPartOrdinalsType addPartOrdinals("hostAddParts", 1); + HostPartOrdinalsType removePartOrdinals("hostRemoveParts", 1); + + entities(0) = node1; + addPartOrdinals(0) = part2.mesh_meta_data_ordinal(); + removePartOrdinals(0) = part1.mesh_meta_data_ordinal(); + + stk::mesh::HostMesh hostMesh(*m_bulk); + hostMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + hostMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(hostMesh); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, addAndRemovePartFromNode_ngpDevice) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + DeviceEntitiesType entities("deviceEntities", 1); + DevicePartOrdinalsType addPartOrdinals("deviceAddParts", 1); + DevicePartOrdinalsType removePartOrdinals("deviceRemoveParts", 1); + + stk::mesh::NgpMesh & ngpMesh = 
stk::mesh::get_updated_ngp_mesh(*m_bulk); + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, &part2, &part1); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + ngpMesh.sync_to_host(); + confirm_host_mesh_is_synchronized_from_device(ngpMesh); + + check_bucket_layout(*m_bulk, {{{"part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, multipleDeviceMeshMods) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + stk::mesh::Part & part3 = m_meta->declare_part_with_topology("part3", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + DeviceEntitiesType entities("deviceEntities", 1); + DevicePartOrdinalsType addPartOrdinals("deviceAddParts", 1); + DevicePartOrdinalsType removePartOrdinals("deviceRemoveParts", 0); + + stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(*m_bulk); + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, &part2, nullptr); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, &part3, nullptr); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + ngpMesh.sync_to_host(); + 
confirm_host_mesh_is_synchronized_from_device(ngpMesh); + + check_bucket_layout(*m_bulk, {{{"part1", "part2"}, {nodeId}}}, stk::topology::NODE_RANK); +} + +TEST_F(NgpBatchChangeEntityParts, failedHostAccessAfterDeviceMeshMod) +{ + if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) GTEST_SKIP(); + + build_empty_mesh(1, 1); + + stk::mesh::Part & part1 = m_meta->declare_part_with_topology("part1", stk::topology::NODE); + stk::mesh::Part & part2 = m_meta->declare_part_with_topology("part2", stk::topology::NODE); + const unsigned nodeId = 1; + const stk::mesh::Entity node1 = create_node(*m_bulk, nodeId, {&part1}); + check_bucket_layout(*m_bulk, {{{"part1"}, {nodeId}}}, stk::topology::NODE_RANK); + + DeviceEntitiesType entities("deviceEntities", 1); + DevicePartOrdinalsType addPartOrdinals("deviceAddParts", 1); + DevicePartOrdinalsType removePartOrdinals("deviceRemoveParts", 0); + + stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(*m_bulk); + fill_device_views_add_remove_part_from_node(entities, addPartOrdinals, removePartOrdinals, ngpMesh, + node1, &part2, nullptr); + + ngpMesh.batch_change_entity_parts(entities, addPartOrdinals, removePartOrdinals); + confirm_host_mesh_is_not_synchronized_from_device(ngpMesh); + + if constexpr (std::is_same_v) { + EXPECT_ANY_THROW(m_bulk->buckets(stk::topology::NODE_RANK)); + EXPECT_ANY_THROW(m_bulk->get_buckets(stk::topology::NODE_RANK, m_meta->universal_part())); + EXPECT_ANY_THROW(m_bulk->modification_begin()); + EXPECT_ANY_THROW(m_bulk->batch_change_entity_parts(stk::mesh::EntityVector{node1}, stk::mesh::PartVector{}, + stk::mesh::PartVector{})); + EXPECT_ANY_THROW(stk::mesh::skin_mesh(*m_bulk, stk::mesh::PartVector{&part1})); + } +} + +} // namespace diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp index 4dc9cadf14c6..8cd3fb2dafad 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp +++ 
b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp @@ -63,6 +63,8 @@ namespace ngp_field_test { +using ngp_unit_test_utils::check_bucket_layout; + class NgpFieldFixture : public stk::unit_test_util::MeshFixture { public: @@ -656,7 +658,7 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture void modify_mesh_add_and_delete_bucket(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_3")}; @@ -664,7 +666,7 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture get_bulk().change_entity_parts(get_bulk().get_entity(stk::topology::ELEM_RANK, 1), addParts, removeParts); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_3", {1}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_3"}, {1}}, {{"block_2"}, {2}}}); } void fill_nodes(const stk::mesh::Entity element, unsigned numNodes, stk::mesh::EntityVector& nodes) @@ -727,19 +729,19 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture void modify_mesh_add_and_delete_bucket3(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); replace_element_and_place_in_block("block_3"); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), 
{{"block_3", {4}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_3"}, {4}}, {{"block_2"}, {2}}}); } void modify_mesh_add_and_delete_bucket2(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_2")}; @@ -750,13 +752,13 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture get_bulk().change_entity_parts(get_bulk().get_entity(stk::topology::ELEM_RANK, 2), addParts, removeParts); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_3", {2}}, {"block_2", {1}}}); + check_bucket_layout(get_bulk(), {{{"block_3"}, {2}}, {{"block_2"}, {1}}}); } void modify_mesh_delete_bucket_in_middle(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_1")}; @@ -764,13 +766,13 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture get_bulk().change_entity_parts(get_bulk().get_entity(stk::topology::ELEM_RANK, 2), addParts, removeParts); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1,2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1,2}}, {{"block_3"}, {3}}}); } void 
modify_mesh_add_bucket_in_middle(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); stk::mesh::PartVector addParts{get_meta().get_part("block_1")}; @@ -778,13 +780,13 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture get_bulk().change_entity_parts(get_bulk().get_entity(stk::topology::ELEM_RANK, 3), addParts, removeParts); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_1", {3}}, {"block_2", {2}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_1"}, {3}}, {{"block_2"}, {2}}}); } void modify_mesh_add_element(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField, unsigned bucketCapacity) { stk::mesh::NgpMesh & ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); @@ -796,17 +798,17 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture ngpMesh.update_mesh(); if(bucketCapacity == 1) { - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3}}, {"block_3", {4}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3}}, {{"block_3"}, {4}}}); } else if(bucketCapacity == 2) { - ngp_unit_test_utils::check_bucket_layout(get_bulk(), {{"block_1", {1}}, {"block_2", {2}}, {"block_3", {3,4}}}); 
+ check_bucket_layout(get_bulk(), {{{"block_1"}, {1}}, {{"block_2"}, {2}}, {{"block_3"}, {3,4}}}); } } void modify_mesh_change_bucket_content(stk::mesh::Field& stkIntField, stk::mesh::NgpField& ngpIntField) { stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), { {"block_1", {1, 2}}, {"block_3", {3}}}); + check_bucket_layout(get_bulk(), {{{"block_1"}, {1, 2}}, {{"block_3"}, {3}}}); check_field_data_on_device(ngpIntField, stkIntField); get_bulk().modification_begin(); stk::mesh::PartVector addParts {get_meta().get_part("block_3")}; @@ -814,7 +816,7 @@ class OptimizedNgpFieldFixture : public NgpFieldFixture get_bulk().change_entity_parts(get_bulk().get_entity(stk::topology::ELEM_RANK, 2), addParts, removeParts); get_bulk().modification_end(); ngpMesh.update_mesh(); - ngp_unit_test_utils::check_bucket_layout(get_bulk(), { {"block_1", {1}}, {"block_3", {2, 3}}}); + check_bucket_layout(get_bulk(), { {{"block_1"}, {1}}, {{"block_3"}, {2, 3}}}); } }; @@ -2630,4 +2632,4 @@ TEST_F(NgpFieldUpdate, MoveBackwardForwardBackward) check_field_values(); } -} +} // namespace ngp_field_test diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpMultiStateFieldTests.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpMultiStateFieldTests.cpp index 78f708dc9517..8ad20403ba53 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpMultiStateFieldTests.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpMultiStateFieldTests.cpp @@ -200,8 +200,8 @@ NGP_TEST_F(NgpMultiStateFieldTest, multistateField_rotateDeviceStates_syncStates const double valueNew = 44.4; const double valueOld = 22.2; - stk::mesh::field_fill(valueNew, get_field_new(), stk::ngp::HostExecSpace()); - stk::mesh::field_fill(valueOld, get_field_old(), stk::ngp::HostExecSpace()); + stk::mesh::field_fill(valueNew, get_field_new()); + stk::mesh::field_fill(valueOld, get_field_old()); stk::mesh::NgpMesh& ngpMesh = 
stk::mesh::get_updated_ngp_mesh(get_bulk()); stk::mesh::NgpField& ngpFieldNew = stk::mesh::get_updated_ngp_field(get_field_new()); @@ -230,8 +230,8 @@ NGP_TEST_F(NgpMultiStateFieldTest, multistateField_copyHasCorrectDataAfterStateR const double valueNew = 44.4; const double valueOld = 22.2; - stk::mesh::field_fill(valueNew, get_field_new(), stk::ngp::HostExecSpace()); - stk::mesh::field_fill(valueOld, get_field_old(), stk::ngp::HostExecSpace()); + stk::mesh::field_fill(valueNew, get_field_new()); + stk::mesh::field_fill(valueOld, get_field_old()); stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); stk::mesh::NgpField& ngpFieldNew = stk::mesh::get_updated_ngp_field(get_field_new()); @@ -260,8 +260,8 @@ NGP_TEST_F(NgpMultiStateFieldTest, multistateField_copyHasWrongDataAfterDeviceSt const double valueNew = 44.4; const double valueOld = 22.2; - stk::mesh::field_fill(valueNew, get_field_new(), stk::ngp::HostExecSpace()); - stk::mesh::field_fill(valueOld, get_field_old(), stk::ngp::HostExecSpace()); + stk::mesh::field_fill(valueNew, get_field_new()); + stk::mesh::field_fill(valueOld, get_field_old()); stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); stk::mesh::NgpField& ngpFieldNew = stk::mesh::get_updated_ngp_field(get_field_new()); @@ -293,8 +293,8 @@ NGP_TEST_F(NgpMultiStateFieldTest, persistentDeviceField_hasCorrectDataAfterStat const double valueNew = 44.4; const double valueOld = 22.2; - stk::mesh::field_fill(valueNew, get_field_new(), stk::ngp::HostExecSpace()); - stk::mesh::field_fill(valueOld, get_field_old(), stk::ngp::HostExecSpace()); + stk::mesh::field_fill(valueNew, get_field_new()); + stk::mesh::field_fill(valueOld, get_field_old()); stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); stk::mesh::NgpField& ngpFieldNew = stk::mesh::get_updated_ngp_field(get_field_new()); @@ -322,8 +322,8 @@ NGP_TEST_F(NgpMultiStateFieldTest, persistentDeviceField_hasWrongDataAfterDevice const double 
valueNew = 44.4; const double valueOld = 22.2; - stk::mesh::field_fill(valueNew, get_field_new(), stk::ngp::HostExecSpace()); - stk::mesh::field_fill(valueOld, get_field_old(), stk::ngp::HostExecSpace()); + stk::mesh::field_fill(valueNew, get_field_new()); + stk::mesh::field_fill(valueOld, get_field_old()); stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); stk::mesh::NgpField& ngpFieldNew = stk::mesh::get_updated_ngp_field(get_field_new()); diff --git a/packages/stk/stk_unit_tests/stk_util/parallel/UnitTestDeviceAwareMPI.cpp b/packages/stk/stk_unit_tests/stk_util/parallel/UnitTestDeviceAwareMPI.cpp index 371565fcea7d..aa3b8f2fe19d 100644 --- a/packages/stk/stk_unit_tests/stk_util/parallel/UnitTestDeviceAwareMPI.cpp +++ b/packages/stk/stk_unit_tests/stk_util/parallel/UnitTestDeviceAwareMPI.cpp @@ -37,25 +37,146 @@ #include "stk_util/parallel/DeviceAwareMPI.hpp" #include "stk_util/ngp/NgpSpaces.hpp" -TEST(DeviceAwareMPI, DISABLED_trueIfOpenMPIAndCuda) +#ifdef STK_HAS_MPI + +TEST(DeviceAwareMPI, trueIfOpenMPIAndCuda) { #if defined(OMPI_MAJOR_VERSION) && defined(KOKKOS_ENABLE_CUDA) EXPECT_TRUE(stk::have_device_aware_mpi()); +#else + GTEST_SKIP()<<"trueIfOpenMPIAndCuda"; +#endif +} + +TEST(DeviceAwareMPI, trueIfATS2MPIAndCuda) +{ +#if defined(IBM_SPECTRUM_MPI) && defined(KOKKOS_ENABLE_CUDA) + EXPECT_TRUE(stk::have_device_aware_mpi()); +#else + GTEST_SKIP()<<"trueIfATS2MPIAndCuda"; #endif } -TEST(DeviceAwareMPI, DISABLED_falseIfOpenMpiButNoCuda) +TEST(DeviceAwareMPI, falseIfOpenMpiButNoCuda) { -#if defined(OMPI_MAJOR_VERSION) && !defined(KOKKOS_ENABLED_CUDA) +#if defined(OMPI_MAJOR_VERSION) && !defined(KOKKOS_ENABLE_CUDA) EXPECT_FALSE(stk::have_device_aware_mpi()); +#else + GTEST_SKIP()<<"falseIfOpenMpiButNoCuda"; #endif } -TEST(DeviceAwareMPI, DISABLED_falseIfIntel) +TEST(DeviceAwareMPI, falseIfIntelMpi) { #if defined(I_MPI_VERSION) EXPECT_FALSE(stk::have_device_aware_mpi()); +#else + GTEST_SKIP()<<"falseIfIntel, I_MPI_VERSION not defined"; 
#endif } +void check_device_aware_mpi_send_recv(MPI_Comm comm) +{ + const int numProcs = stk::parallel_machine_size(comm); + ASSERT_EQ(2, numProcs); + const int myProc = stk::parallel_machine_rank(comm); + const int otherProc = 1 - myProc; + const int msgTag = 10101; + + using BufferViewType = Kokkos::View; + constexpr size_t N = 8; + constexpr double tol = 1.e-7; + constexpr double goldValue = 3.14159; + + if (myProc == 0) { + BufferViewType sendBuf("sendBuf",N); + Kokkos::deep_copy(sendBuf, goldValue); + EXPECT_EQ(MPI_SUCCESS, MPI_Send(sendBuf.data(), N, MPI_DOUBLE, otherProc, msgTag, comm)); + } + else { + BufferViewType recvBuf("recvBuf",N); + Kokkos::deep_copy(recvBuf, 0.0); + MPI_Status status; + EXPECT_EQ( MPI_SUCCESS, MPI_Recv(recvBuf.data(), N, MPI_DOUBLE, otherProc, msgTag, comm, &status)); + + BufferViewType::HostMirror hostRecvBuf = Kokkos::create_mirror_view(recvBuf); + Kokkos::deep_copy(hostRecvBuf, recvBuf); + for(size_t i=0; i; + constexpr size_t N = 4200; + constexpr double tol = 1.e-7; + constexpr double goldValue = 3.14159; + + const int numCommProcs = 1; + std::vector sendRequests(numCommProcs); + std::vector recvRequests(numCommProcs); + std::vector statuses(numCommProcs); + + BufferViewType sendBuf("sendBuf",N); + BufferViewType recvBuf("recvBuf",N); + + stk::parallel_machine_barrier(comm); + + if (myProc == 1) { + Kokkos::deep_copy(recvBuf, 0.0); //theoretically unnecessary since Views initialize by default. 
+ EXPECT_EQ(MPI_SUCCESS, MPI_Irecv(recvBuf.data(), N, MPI_DOUBLE, otherProc, msgTag, comm, &recvRequests[0])); + } + if (myProc == 0) { + Kokkos::deep_copy(sendBuf, goldValue); + EXPECT_EQ(MPI_SUCCESS, MPI_Isend(sendBuf.data(), N, MPI_DOUBLE, otherProc, msgTag, comm, &sendRequests[0])); + } + + Kokkos::fence(); + + if (myProc == 1) { + int idx = 99; + MPI_Waitany(numCommProcs, recvRequests.data(), &idx, MPI_STATUS_IGNORE); + EXPECT_EQ(0, idx); + + BufferViewType::HostMirror hostRecvBuf = Kokkos::create_mirror_view(recvBuf); + Kokkos::deep_copy(hostRecvBuf, recvBuf); + for(size_t i=0; i + +TEST(FPExceptions, SimpleAdditionNoError) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + double x = 1.0 + 2.0; + EXPECT_NO_THROW(stk::util::throw_on_fp_error()); + EXPECT_EQ(x, 3.0); // appease the compiler +} + +TEST(FPExceptions, Log0Error) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + std::log(0.0); + EXPECT_ANY_THROW(stk::util::throw_on_fp_error()); +} + +TEST(FPExceptions, FlagsAreClearedAfterThrow) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + std::log(0.0); + EXPECT_ANY_THROW(stk::util::throw_on_fp_error()); + EXPECT_NO_THROW(stk::util::throw_on_fp_error()); +} + +TEST(FPExceptions, ErrorMessageContainsName) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + std::log(0.0); + + std::string fname = "my_very_specific_and_clear_function_name"; + try { + stk::util::throw_on_fp_error(fname.c_str()); + } catch (std::exception& except) + { + std::string msg(except.what()); + size_t pos = msg.find(fname); + EXPECT_NE(pos, std::string::npos); + } +} + +TEST(FPExceptions, NoWarning) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + double x = 1.0 
+ 2.0; + std::stringstream ss; + EXPECT_NO_THROW(stk::util::warn_on_fp_error(nullptr, ss)); + EXPECT_EQ(ss.str().size(), 0U); + EXPECT_EQ(x, 3.0); // appease the compiler +} + +TEST(FPExceptions, Warning) +{ + if (!stk::util::have_errno() && !stk::util::have_errexcept()) GTEST_SKIP(); + + stk::util::clear_fp_errors(); + std::log(0.0); + std::stringstream ss; + EXPECT_NO_THROW(stk::util::warn_on_fp_error(nullptr, ss)); + EXPECT_GT(ss.str().size(), 0U); +} diff --git a/packages/stk/stk_util/stk_util/Version.hpp b/packages/stk/stk_util/stk_util/Version.hpp index 244712c23833..4f4efaf1655d 100644 --- a/packages/stk/stk_util/stk_util/Version.hpp +++ b/packages/stk/stk_util/stk_util/Version.hpp @@ -44,7 +44,7 @@ //See the file CHANGELOG.md for a listing that shows the //correspondence between version numbers and API changes. -#define STK_VERSION 5210600 +#define STK_VERSION 5210601 namespace stk diff --git a/packages/stk/stk_util/stk_util/ngp/NgpSpaces.hpp b/packages/stk/stk_util/stk_util/ngp/NgpSpaces.hpp index d17990ca2a23..c0dc9a2d8f34 100644 --- a/packages/stk/stk_util/stk_util/ngp/NgpSpaces.hpp +++ b/packages/stk/stk_util/stk_util/ngp/NgpSpaces.hpp @@ -40,45 +40,36 @@ namespace stk { namespace ngp { using ExecSpace = Kokkos::DefaultExecutionSpace; - using HostExecSpace = Kokkos::DefaultHostExecutionSpace; -#ifdef KOKKOS_ENABLE_CUDA -using MemSpace = Kokkos::CudaSpace; -#elif defined(KOKKOS_ENABLE_HIP) -using MemSpace = Kokkos::HIPSpace; +#ifndef KOKKOS_HAS_SHARED_HOST_PINNED_SPACE +#ifndef _MSC_VER +#warning "Kokkos::SharedHostPinnedSpace is not defined." 
#else -using MemSpace = ExecSpace::memory_space; +#pragma message("Kokkos::SharedHostPinnedSpace is not defined.") #endif - -#ifdef KOKKOS_HAS_SHARED_SPACE -using UVMMemSpace = Kokkos::SharedSpace; -#else -#ifdef KOKKOS_ENABLE_CUDA -#ifdef KOKKOS_ENABLE_CUDA_UVM -using UVMMemSpace = Kokkos::CudaUVMSpace; +using HostPinnedSpace = Kokkos::HostSpace; #else -using UVMMemSpace = Kokkos::CudaHostPinnedSpace; +using HostPinnedSpace = Kokkos::SharedHostPinnedSpace; #endif -#elif defined(KOKKOS_ENABLE_HIP) -using UVMMemSpace = Kokkos::HIPHostPinnedSpace; -#elif defined(KOKKOS_ENABLE_OPENMP) -using UVMMemSpace = Kokkos::OpenMP; + +#ifndef KOKKOS_HAS_SHARED_SPACE +#ifndef _MSC_VER +#warning "Kokkos::SharedSpace is not defined." #else -using UVMMemSpace = Kokkos::HostSpace; +#pragma message("Kokkos::SharedSpace is not defined.") #endif +using UVMMemSpace = Kokkos::HostSpace; +#else +using UVMMemSpace = Kokkos::SharedSpace; #endif -#ifdef KOKKOS_HAS_SHARED_SPACE -using HostPinnedSpace = Kokkos::SharedHostPinnedSpace; -#else #ifdef KOKKOS_ENABLE_CUDA -using HostPinnedSpace = Kokkos::CudaHostPinnedSpace; +using MemSpace = Kokkos::CudaSpace; #elif defined(KOKKOS_ENABLE_HIP) -using HostPinnedSpace = Kokkos::HIPHostPinnedSpace; +using MemSpace = Kokkos::HIPSpace; #else -using HostPinnedSpace = MemSpace; -#endif +using MemSpace = ExecSpace::memory_space; #endif #ifdef KOKKOS_ENABLE_HIP diff --git a/packages/stk/stk_util/stk_util/parallel/DeviceAwareMPI.cpp b/packages/stk/stk_util/stk_util/parallel/DeviceAwareMPI.cpp index 64efda9af165..99a227352018 100644 --- a/packages/stk/stk_util/stk_util/parallel/DeviceAwareMPI.cpp +++ b/packages/stk/stk_util/stk_util/parallel/DeviceAwareMPI.cpp @@ -34,6 +34,7 @@ #include #include +#include #include #ifdef OMPI_MAJOR_VERSION @@ -44,8 +45,15 @@ namespace stk { bool have_device_aware_mpi() { -#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT - return true; +#if defined(STK_ENABLE_GPU) && defined(MPIX_CUDA_AWARE_SUPPORT) + //This 
runtime-check is described at this web page: + //https://www.open-mpi.org/faq/?category=runcuda + if (1 == MPIX_Query_cuda_support()) { + return true; + } + else { + return false; + } #endif return false; diff --git a/packages/stk/stk_util/stk_util/parallel/OutputStreams.cpp b/packages/stk/stk_util/stk_util/parallel/OutputStreams.cpp index b99379794fea..8a84880ee6d5 100644 --- a/packages/stk/stk_util/stk_util/parallel/OutputStreams.cpp +++ b/packages/stk/stk_util/stk_util/parallel/OutputStreams.cpp @@ -105,7 +105,9 @@ void output_flush() void set_outputP0(std::ostream* ostreamPtr, ParallelMachine comm) { - reset_default_output_streams(comm); + if (comm != OutputStreams::instance().m_comm) { + reset_default_output_streams(comm); + } if (stk::parallel_machine_rank(comm) == 0) { OutputStreams::instance().m_outputP0 = ostreamPtr; } diff --git a/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp b/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp index d1396ef1b150..c2b9f9ded275 100644 --- a/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp +++ b/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp @@ -42,7 +42,7 @@ //In Sierra, STK_VERSION_STRING is provided on the compile line by bake. //For Trilinos stk snapshots, the following macro definition gets populated with //the real version string by the trilinos_snapshot.sh script. 
-#define STK_VERSION_STRING "5.21.6-340-g20e31875" +#define STK_VERSION_STRING "5.23.1-605-g31b54b7f" #endif namespace stk { diff --git a/packages/stk/stk_util/stk_util/stk_config.h b/packages/stk/stk_util/stk_util/stk_config.h index 491d6c5bb090..14164584d5b9 100644 --- a/packages/stk/stk_util/stk_util/stk_config.h +++ b/packages/stk/stk_util/stk_util/stk_config.h @@ -49,6 +49,8 @@ #define STK_HAS_SEACAS_IOSS #define STK_HAS_SEACAS_EXODUS #define STK_HAS_SEACAS_NEMESIS +#define STK_HAVE_FP_EXCEPT +#define STK_HAVE_FP_ERRNO #else // This file gets created by cmake during a Trilinos build diff --git a/packages/stk/stk_util/stk_util/util/FPExceptions.cpp b/packages/stk/stk_util/stk_util/util/FPExceptions.cpp new file mode 100644 index 000000000000..f09fdf77f462 --- /dev/null +++ b/packages/stk/stk_util/stk_util/util/FPExceptions.cpp @@ -0,0 +1,51 @@ +#include "FPExceptions.hpp" + +namespace stk { +namespace util { + +namespace { +void append_string(std::string& all_exceptions_string, const std::string& new_string) +{ + if (all_exceptions_string.size() == 0) + { + all_exceptions_string = new_string; + } else + { + all_exceptions_string = all_exceptions_string + ", " + new_string; + } +} +} + +std::string get_fe_except_string(int fe_except_bitmask) +{ + std::string all_exceptions_string; + if (fe_except_bitmask & FE_DIVBYZERO) + { + append_string(all_exceptions_string, "FE_DIVBYZERO"); + } + + if (fe_except_bitmask & FE_INEXACT) + { + append_string(all_exceptions_string, "FE_INEXACT"); + } + + if ( fe_except_bitmask & FE_INVALID) + { + append_string(all_exceptions_string, "FE_INVALID"); + } + + if (fe_except_bitmask & FE_OVERFLOW) + { + append_string(all_exceptions_string, "FE_OVERFLOW"); + } + + if (fe_except_bitmask & FE_UNDERFLOW) + { + append_string(all_exceptions_string, "FE_UNDERFLOW"); + } + + return all_exceptions_string; +} + +} +} \ No newline at end of file diff --git a/packages/stk/stk_util/stk_util/util/FPExceptions.hpp 
b/packages/stk/stk_util/stk_util/util/FPExceptions.hpp new file mode 100644 index 000000000000..3d65d0a6017a --- /dev/null +++ b/packages/stk/stk_util/stk_util/util/FPExceptions.hpp @@ -0,0 +1,95 @@ +#ifndef STK_UTIL_FPEXCEPTIONS +#define STK_UTIL_FPEXCEPTIONS + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stk { +namespace util { + +constexpr bool have_errno() +{ +#ifdef STK_HAVE_FP_ERRNO + return math_errhandling & MATH_ERRNO; +#else + return false; +#endif +} + +constexpr bool have_errexcept() +{ +#ifdef STK_HAVE_FP_EXCEPT + return math_errhandling & MATH_ERREXCEPT; +#else + return false; +#endif +} + +std::string get_fe_except_string(int fe_except_bitmask); + +inline void clear_fp_errors() +{ + if constexpr (have_errexcept()) + { + std::feclearexcept(FE_ALL_EXCEPT); + } else if constexpr (have_errno()) + { + errno = 0; + } +} + +inline void throw_or_warn_on_fp_error(const char* fname = nullptr, bool warn=false, std::ostream& os = std::cerr) +{ + if constexpr (have_errexcept()) + { + int fe_except_bitmask = std::fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT); + if (fe_except_bitmask != 0) + { + std::string msg = std::string(fname ? fname : "") + " raised floating point error(s): " + get_fe_except_string(fe_except_bitmask); + clear_fp_errors(); + if (warn) + { + os << msg << std::endl; + } else { + STK_ThrowRequireMsg(fe_except_bitmask == 0, msg); + } + } + } else if constexpr (have_errno()) + { + if (errno != 0) + { + std::string msg = std::string(fname ? 
fname : "") + " raised floating point error(s) " + std::strerror(errno); + clear_fp_errors(); + if (warn) + { + os << msg << std::endl; + } else + { + STK_ThrowRequireMsg(errno == 0, msg); + } + } + } +} + +inline void warn_on_fp_error(const char* fname = nullptr, std::ostream& os = std::cerr) +{ + throw_or_warn_on_fp_error(fname, true, os); +} + +inline void throw_on_fp_error(const char* fname = nullptr) +{ + throw_or_warn_on_fp_error(fname, false); +} + + +} +} + +#endif From fd704ca6236eb8e041ff269059c9004ae1741019 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 10 Sep 2024 10:25:02 -0600 Subject: [PATCH 116/243] Panzer MiniEM: Propagate parameter change from MueLu Signed-off-by: Christian Glusa --- .../panzer/mini-em/example/BlockPrec/solverMueLu.xml | 12 ++++++------ .../mini-em/example/BlockPrec/solverMueLuEpetra.xml | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml index 794d8ad984df..d71ba1fd5230 100644 --- a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml +++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml @@ -164,7 +164,7 @@ - + @@ -209,7 +209,7 @@ - + @@ -269,7 +269,7 @@ - + @@ -412,7 +412,7 @@ - + @@ -465,7 +465,7 @@ - + @@ -531,7 +531,7 @@ - + diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLuEpetra.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLuEpetra.xml index deab8726c4da..45f092c25401 100644 --- a/packages/panzer/mini-em/example/BlockPrec/solverMueLuEpetra.xml +++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLuEpetra.xml @@ -368,7 +368,7 @@ - + @@ -414,7 +414,7 @@ - + @@ -480,7 +480,7 @@ - + From 8e7950b59047b8c97dc5180c341aee99719f3ba7 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 11 Nov 2024 12:20:40 -0700 Subject: [PATCH 117/243] Snapshot of kokkos.git from commit 2916eadc4b552dfd64579e79f0291e555d3ec91f From repository at 
git@github.com:kokkos/kokkos.git At commit: commit 2916eadc4b552dfd64579e79f0291e555d3ec91f Author: Nathan Ellingwood Date: Mon Nov 11 11:27:59 2024 -0700 update master_history.txt for 4.5.00 Signed-off-by: Nathan Ellingwood Signed-off-by: Nathan Ellingwood --- .../kokkos/{appveyor.yml => .appveyor.yml} | 2 +- packages/kokkos/.clang-format | 2 +- packages/kokkos/.clang-format-ignore | 3 - packages/kokkos/.clang-tidy | 4 +- packages/kokkos/.cmake-format.py | 28 + packages/kokkos/.codecov.yml | 11 - packages/kokkos/.jenkins | 383 +-- packages/kokkos/.jenkins_nightly | 16 +- packages/kokkos/CHANGELOG.md | 101 +- packages/kokkos/CMakeLists.txt | 417 ++- packages/kokkos/CONTRIBUTING.md | 2 + packages/kokkos/HOW_TO_SNAPSHOT | 73 - packages/kokkos/Makefile.kokkos | 282 +- packages/kokkos/Makefile.targets | 16 +- packages/kokkos/README.md | 6 +- packages/kokkos/algorithms/CMakeLists.txt | 12 +- packages/kokkos/algorithms/src/CMakeLists.txt | 33 +- .../kokkos/algorithms/src/Kokkos_Random.hpp | 11 +- .../src/sorting/Kokkos_BinOpsPublicAPI.hpp | 22 +- .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 3 +- .../src/sorting/Kokkos_SortPublicAPI.hpp | 20 +- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 22 +- .../src/sorting/impl/Kokkos_SortImpl.hpp | 44 +- .../src/std_algorithms/Kokkos_Reduce.hpp | 24 +- .../std_algorithms/Kokkos_TransformReduce.hpp | 24 +- .../impl/Kokkos_Constraints.hpp | 21 +- .../impl/Kokkos_MoveBackward.hpp | 2 +- .../impl/Kokkos_RandomAccessIterator.hpp | 20 +- .../std_algorithms/impl/Kokkos_Reverse.hpp | 2 +- .../impl/Kokkos_ReverseCopy.hpp | 2 +- .../algorithms/unit_tests/CMakeLists.txt | 400 ++- .../algorithms/unit_tests/TestBinSortA.hpp | 49 +- .../algorithms/unit_tests/TestBinSortB.hpp | 4 + .../algorithms/unit_tests/TestNestedSort.hpp | 10 + .../algorithms/unit_tests/TestRandom.hpp | 11 +- .../unit_tests/TestRandomAccessIterator.cpp | 33 +- .../kokkos/algorithms/unit_tests/TestSort.hpp | 15 +- .../algorithms/unit_tests/TestSortByKey.hpp | 20 +- 
.../TestStdAlgorithmsAdjacentDifference.cpp | 2 +- .../TestStdAlgorithmsAdjacentFind.cpp | 4 +- .../unit_tests/TestStdAlgorithmsCommon.hpp | 28 +- .../TestStdAlgorithmsConstraints.cpp | 19 +- .../unit_tests/TestStdAlgorithmsCopyIf.cpp | 8 +- .../TestStdAlgorithmsExclusiveScan.cpp | 2 +- .../unit_tests/TestStdAlgorithmsForEach.cpp | 2 - .../TestStdAlgorithmsHelperFunctors.hpp | 2 +- .../TestStdAlgorithmsInclusiveScan.cpp | 2 +- .../unit_tests/TestStdAlgorithmsIsSorted.cpp | 7 +- .../TestStdAlgorithmsIsSortedUntil.cpp | 5 +- .../unit_tests/TestStdAlgorithmsMismatch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModOps.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModSeqOps.cpp | 2 +- .../TestStdAlgorithmsMoveBackward.cpp | 2 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- .../unit_tests/TestStdAlgorithmsRemove.cpp | 4 +- .../TestStdAlgorithmsRemoveCopy.cpp | 2 +- .../TestStdAlgorithmsRemoveCopyIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRemoveIf.cpp | 6 +- .../unit_tests/TestStdAlgorithmsReplace.cpp | 4 +- .../TestStdAlgorithmsReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsReplaceCopyIf.cpp | 4 +- .../unit_tests/TestStdAlgorithmsReplaceIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsReverse.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRotate.cpp | 2 +- .../TestStdAlgorithmsRotateCopy.cpp | 4 +- .../unit_tests/TestStdAlgorithmsSearch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsSearch_n.cpp | 4 +- .../unit_tests/TestStdAlgorithmsShiftLeft.cpp | 2 +- .../TestStdAlgorithmsShiftRight.cpp | 4 +- ...estStdAlgorithmsTeamAdjacentDifference.cpp | 8 +- .../unit_tests/TestStdAlgorithmsTeamCopy.cpp | 2 +- .../TestStdAlgorithmsTeamCopyIf.cpp | 2 +- .../TestStdAlgorithmsTeamCopy_n.cpp | 2 +- .../unit_tests/TestStdAlgorithmsTeamCount.cpp | 2 +- .../TestStdAlgorithmsTeamExclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamFind.cpp | 2 +- .../TestStdAlgorithmsTeamFindEnd.cpp | 8 +- .../TestStdAlgorithmsTeamFindIf.cpp | 2 +- .../TestStdAlgorithmsTeamFindIfNot.cpp | 2 +- 
.../TestStdAlgorithmsTeamGenerate_n.cpp | 2 +- .../TestStdAlgorithmsTeamIsSorted.cpp | 2 +- .../TestStdAlgorithmsTeamIsSortedUntil.cpp | 10 +- .../TestStdAlgorithmsTeamMaxElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinMaxElement.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamMove.cpp | 2 +- .../TestStdAlgorithmsTeamRemove.cpp | 2 +- .../TestStdAlgorithmsTeamRemoveCopy.cpp | 4 +- .../TestStdAlgorithmsTeamRemoveCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamRotateCopy.cpp | 2 +- .../TestStdAlgorithmsTeamShiftRight.cpp | 2 +- .../TestStdAlgorithmsTeamSwapRanges.cpp | 2 +- ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 4 +- .../TestStdAlgorithmsTeamUnique.cpp | 4 +- .../TestStdAlgorithmsTeamUniqueCopy.cpp | 10 +- ...estStdAlgorithmsTransformExclusiveScan.cpp | 4 +- ...estStdAlgorithmsTransformInclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsUnique.cpp | 2 +- .../TestStdAlgorithmsUniqueCopy.cpp | 4 +- .../algorithms/unit_tests/TestStdReducers.cpp | 6 +- packages/kokkos/benchmarks/CMakeLists.txt | 20 +- .../kokkos/benchmarks/atomic/CMakeLists.txt | 5 +- .../benchmarks/bytes_and_flops/CMakeLists.txt | 9 +- .../bytes_and_flops/bench_unroll_stride.hpp | 6 +- .../kokkos/benchmarks/gather/CMakeLists.txt | 5 +- .../kokkos/benchmarks/gups/CMakeLists.txt | 5 +- packages/kokkos/benchmarks/gups/gups.cpp | 2 +- .../benchmarks/launch_latency/CMakeLists.txt | 5 +- .../launch_latency/launch_latency.cpp | 4 +- .../policy_performance/CMakeLists.txt | 5 +- .../kokkos/benchmarks/stream/CMakeLists.txt | 5 +- .../view_copy_constructor/CMakeLists.txt | 5 +- packages/kokkos/bin/kokkos_launch_compiler | 4 +- packages/kokkos/cmake/Dependencies.cmake | 6 +- packages/kokkos/cmake/KokkosCore_config.h.in | 11 +- .../cmake/KokkosTrilinosConfig.cmake.in | 17 - .../kokkos/cmake/Modules/CudaToolkit.cmake | 196 +- .../kokkos/cmake/Modules/FindTPLCUDA.cmake 
| 68 +- .../kokkos/cmake/Modules/FindTPLHPX.cmake | 11 +- .../kokkos/cmake/Modules/FindTPLHWLOC.cmake | 2 +- .../kokkos/cmake/Modules/FindTPLLIBDL.cmake | 2 +- .../cmake/Modules/FindTPLLIBQUADMATH.cmake | 20 +- .../kokkos/cmake/Modules/FindTPLONEDPL.cmake | 66 +- .../kokkos/cmake/Modules/FindTPLROCM.cmake | 22 +- .../cmake/Modules/FindTPLROCTHRUST.cmake | 10 +- .../kokkos/cmake/Modules/FindTPLTHREADS.cmake | 23 +- packages/kokkos/cmake/README.md | 14 - packages/kokkos/cmake/build_env_info.cmake | 103 +- .../compile_tests/amd_apu.cc} | 24 +- packages/kokkos/cmake/cray.cmake | 11 +- packages/kokkos/cmake/deps/CUDA.cmake | 30 +- packages/kokkos/cmake/deps/HWLOC.cmake | 6 +- packages/kokkos/cmake/deps/Pthread.cmake | 38 +- packages/kokkos/cmake/deps/quadmath.cmake | 5 +- packages/kokkos/cmake/fake_tribits.cmake | 465 ++-- packages/kokkos/cmake/gnu.cmake | 38 +- packages/kokkos/cmake/intel.cmake | 29 +- packages/kokkos/cmake/kokkos_arch.cmake | 2261 +++++++++-------- packages/kokkos/cmake/kokkos_check_env.cmake | 27 +- .../kokkos/cmake/kokkos_compiler_id.cmake | 437 ++-- .../cmake/kokkos_configure_trilinos.cmake | 38 + .../kokkos/cmake/kokkos_corner_cases.cmake | 12 +- .../kokkos/cmake/kokkos_enable_devices.cmake | 212 +- .../kokkos/cmake/kokkos_enable_options.cmake | 358 +-- packages/kokkos/cmake/kokkos_functions.cmake | 1319 +++++----- packages/kokkos/cmake/kokkos_install.cmake | 78 +- .../kokkos/cmake/kokkos_pick_cxx_std.cmake | 36 +- .../kokkos/cmake/kokkos_test_cxx_std.cmake | 285 ++- packages/kokkos/cmake/kokkos_tpls.cmake | 202 +- packages/kokkos/cmake/kokkos_tribits.cmake | 732 +++--- packages/kokkos/cmake/msvc.cmake | 20 +- packages/kokkos/cmake/pgi.cmake | 10 +- packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake | 7 +- .../kokkos/cmake/tpls/FindTPLPthread.cmake | 35 +- .../kokkos/cmake/tpls/FindTPLquadmath.cmake | 5 +- packages/kokkos/containers/CMakeLists.txt | 14 +- .../performance_tests/CMakeLists.txt | 17 +- .../performance_tests/TestScatterView.hpp | 8 +- 
packages/kokkos/containers/src/CMakeLists.txt | 34 +- .../kokkos/containers/src/Kokkos_Bitset.hpp | 2 +- .../kokkos/containers/src/Kokkos_DualView.hpp | 241 +- .../containers/src/Kokkos_DynRankView.hpp | 1451 ++++------- .../containers/src/Kokkos_DynamicView.hpp | 104 +- .../containers/src/Kokkos_OffsetView.hpp | 961 ++----- .../containers/src/Kokkos_ScatterView.hpp | 78 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 2 +- .../containers/src/Kokkos_UnorderedMap.hpp | 16 +- .../kokkos/containers/src/Kokkos_Vector.hpp | 5 +- .../containers/unit_tests/CMakeLists.txt | 67 +- .../containers/unit_tests/TestBitset.hpp | 2 +- .../containers/unit_tests/TestDualView.hpp | 117 +- .../unit_tests/TestDynRankViewTypedefs.cpp | 260 ++ .../TestDynRankView_TeamScratch.hpp | 72 + .../containers/unit_tests/TestDynViewAPI.hpp | 25 +- .../containers/unit_tests/TestDynamicView.hpp | 33 +- .../unit_tests/TestErrorReporter.hpp | 5 +- .../containers/unit_tests/TestOffsetView.hpp | 210 +- .../containers/unit_tests/TestScatterView.hpp | 18 +- .../unit_tests/TestStaticCrsGraph.hpp | 18 +- .../unit_tests/TestUnorderedMap.hpp | 5 +- .../TestViewCtorPropEmbeddedDim.hpp | 16 +- .../unit_tests/TestWithoutInitializing.hpp | 28 +- packages/kokkos/core/CMakeLists.txt | 30 +- packages/kokkos/core/perf_test/CMakeLists.txt | 254 +- .../kokkos/core/perf_test/PerfTestHexGrad.cpp | 4 +- .../perf_test/PerfTest_CustomReduction.cpp | 2 - .../PerfTest_ExecSpacePartitioning.cpp | 3 +- .../core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 - .../core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 - .../perf_test/PerfTest_ViewResize_Raw.cpp | 2 - .../kokkos/core/perf_test/test_mempool.cpp | 4 +- .../core/perf_test/test_sharedSpace.cpp | 2 +- .../kokkos/core/perf_test/test_taskdag.cpp | 9 + packages/kokkos/core/src/CMakeLists.txt | 314 ++- packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp | 1 - .../kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 9 +- .../kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp | 18 +- 
.../src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 41 +- .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 32 +- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 24 +- .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 13 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 58 +- .../src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 2 +- .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 37 +- .../kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 11 +- .../kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 186 +- .../src/Cuda/Kokkos_Cuda_Vectorization.hpp | 18 +- .../core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 11 +- packages/kokkos/core/src/HIP/Kokkos_HIP.cpp | 60 +- .../HIP/Kokkos_HIP_BlockSize_Deduction.hpp | 5 +- .../src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 43 +- .../core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 37 +- .../core/src/HIP/Kokkos_HIP_Instance.cpp | 40 +- .../core/src/HIP/Kokkos_HIP_Instance.hpp | 13 +- .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 44 +- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 6 +- .../src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 4 +- .../src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 20 +- .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 79 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 2 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 2 +- .../src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp | 10 +- .../kokkos/core/src/HIP/Kokkos_HIP_Space.cpp | 49 +- .../kokkos/core/src/HIP/Kokkos_HIP_Space.hpp | 44 +- .../kokkos/core/src/HIP/Kokkos_HIP_Team.hpp | 197 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 3 +- .../core/src/HIP/Kokkos_HIP_Vectorization.hpp | 22 +- .../core/src/HIP/Kokkos_HIP_ZeroMemset.cpp | 36 + .../core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 21 +- packages/kokkos/core/src/HPX/Kokkos_HPX.hpp | 117 +- .../kokkos/core/src/HPX/Kokkos_HPX_Task.hpp | 11 + .../core/src/KokkosExp_MDRangePolicy.hpp | 94 +- .../kokkos/core/src/Kokkos_AnonymousSpace.hpp | 8 +- packages/kokkos/core/src/Kokkos_Array.hpp | 40 +- packages/kokkos/core/src/Kokkos_Atomic.hpp | 1 - 
.../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 196 -- .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 277 +- packages/kokkos/core/src/Kokkos_Complex.hpp | 30 +- packages/kokkos/core/src/Kokkos_Concepts.hpp | 51 +- packages/kokkos/core/src/Kokkos_CopyViews.hpp | 346 ++- packages/kokkos/core/src/Kokkos_Core.hpp | 10 +- packages/kokkos/core/src/Kokkos_Core_fwd.hpp | 9 +- packages/kokkos/core/src/Kokkos_Crs.hpp | 14 +- .../kokkos/core/src/Kokkos_DetectionIdiom.hpp | 6 +- .../kokkos/core/src/Kokkos_ExecPolicy.hpp | 160 +- packages/kokkos/core/src/Kokkos_Extents.hpp | 2 +- packages/kokkos/core/src/Kokkos_Future.hpp | 37 +- packages/kokkos/core/src/Kokkos_Graph.hpp | 69 +- packages/kokkos/core/src/Kokkos_GraphNode.hpp | 86 +- packages/kokkos/core/src/Kokkos_HostSpace.hpp | 33 +- packages/kokkos/core/src/Kokkos_Layout.hpp | 36 +- packages/kokkos/core/src/Kokkos_Macros.hpp | 64 +- .../kokkos/core/src/Kokkos_MemoryPool.hpp | 11 +- .../kokkos/core/src/Kokkos_NumericTraits.hpp | 2 +- packages/kokkos/core/src/Kokkos_Pair.hpp | 6 +- packages/kokkos/core/src/Kokkos_Parallel.hpp | 24 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 181 +- .../src/Kokkos_Profiling_ProfileSection.hpp | 2 +- .../src/Kokkos_Profiling_ScopedRegion.hpp | 2 +- .../kokkos/core/src/Kokkos_ScratchSpace.hpp | 2 +- .../kokkos/core/src/Kokkos_TaskScheduler.hpp | 68 +- .../core/src/Kokkos_TaskScheduler_fwd.hpp | 43 +- packages/kokkos/core/src/Kokkos_Timer.hpp | 2 +- packages/kokkos/core/src/Kokkos_Tuners.hpp | 125 +- packages/kokkos/core/src/Kokkos_TypeInfo.hpp | 103 + packages/kokkos/core/src/Kokkos_View.hpp | 2014 +-------------- .../core/src/Kokkos_WorkGraphPolicy.hpp | 4 +- .../core/src/OpenACC/Kokkos_OpenACC.cpp | 55 + .../core/src/OpenACC/Kokkos_OpenACC.hpp | 5 +- .../core/src/OpenACC/Kokkos_OpenACCSpace.hpp | 10 + .../OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Instance.cpp | 15 +- .../src/OpenACC/Kokkos_OpenACC_Instance.hpp | 3 +- 
.../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 620 +++-- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 560 +++- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 111 +- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Traits.hpp | 5 +- .../kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp | 14 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 11 + .../src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 2 - .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 81 +- .../Kokkos_OpenMPTarget_DeepCopy.hpp | 101 + .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 130 - .../Kokkos_OpenMPTarget_FunctorAdapter.hpp | 48 + .../Kokkos_OpenMPTarget_Instance.cpp | 88 +- .../Kokkos_OpenMPTarget_Instance.hpp | 21 +- .../Kokkos_OpenMPTarget_Parallel.hpp | 41 +- ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 129 +- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 24 +- .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 41 +- ...os_OpenMPTarget_ParallelReduce_MDRange.hpp | 336 +-- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 24 +- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 76 +- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 60 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 216 +- .../Kokkos_OpenMPTarget_Reducer.hpp | 160 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp | 251 -- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp | 319 --- packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 12 +- packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 36 +- .../src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp | 37 +- .../src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp | 14 +- .../core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 64 +- .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 49 +- .../core/src/SYCL/Kokkos_SYCL_Instance.hpp | 45 +- .../src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp | 11 +- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 13 +- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 18 +- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 24 +- 
.../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 16 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 19 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 29 +- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 +- .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 32 +- .../core/src/SYCL/Kokkos_SYCL_Space.hpp | 106 +- .../kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp | 163 +- .../core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp | 5 +- .../core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 11 +- .../kokkos/core/src/Serial/Kokkos_Serial.hpp | 5 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 10 + .../Serial/Kokkos_Serial_Parallel_Range.hpp | 38 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 20 +- .../core/src/Serial/Kokkos_Serial_Task.hpp | 15 +- .../Serial/Kokkos_Serial_WorkGraphPolicy.hpp | 4 +- .../src/Serial/Kokkos_Serial_ZeroMemset.hpp | 12 +- .../src/Threads/Kokkos_Threads_Instance.cpp | 20 +- .../src/Threads/Kokkos_Threads_Instance.hpp | 8 +- .../Kokkos_Threads_ParallelFor_MDRange.hpp | 4 +- .../Kokkos_Threads_ParallelFor_Range.hpp | 8 +- .../Kokkos_Threads_ParallelFor_Team.hpp | 24 +- .../Kokkos_Threads_ParallelReduce_MDRange.hpp | 10 +- .../Kokkos_Threads_ParallelReduce_Range.hpp | 8 +- .../Kokkos_Threads_ParallelReduce_Team.hpp | 13 +- .../Kokkos_Threads_ParallelScan_Range.hpp | 8 +- .../src/Threads/Kokkos_Threads_Spinwait.cpp | 2 +- .../src/Threads/Kokkos_Threads_Spinwait.hpp | 3 +- .../core/src/Threads/Kokkos_Threads_Team.hpp | 173 +- .../Kokkos_Threads_WorkGraphPolicy.hpp | 4 +- .../kokkos/core/src/View/Kokkos_BasicView.hpp | 652 +++++ .../kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 308 +-- .../Kokkos_ViewAtomic.hpp} | 10 +- .../src/{impl => View}/Kokkos_ViewCtor.hpp | 87 +- .../Kokkos_ViewDataAnalysis.hpp | 15 +- .../core/src/View/Kokkos_ViewLegacy.hpp | 1604 ++++++++++++ .../src/{impl => View}/Kokkos_ViewMapping.hpp | 453 ++-- .../src/{impl => View}/Kokkos_ViewTracker.hpp | 0 .../core/src/View/Kokkos_ViewTraits.hpp | 457 ++++ 
.../{impl => View}/Kokkos_ViewUniformType.hpp | 12 +- .../View/MDSpan/Kokkos_MDSpan_Accessor.hpp | 203 +- .../src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 119 +- .../core/src/decl/Kokkos_Declare_CUDA.hpp | 2 + .../core/src/decl/Kokkos_Declare_SYCL.hpp | 10 + .../kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp | 2 - .../src/impl/KokkosExp_Host_IterateTile.hpp | 71 +- .../src/impl/KokkosExp_IterateTileGPU.hpp | 8 +- .../core/src/impl/Kokkos_AnalyzePolicy.hpp | 2 +- .../kokkos/core/src/impl/Kokkos_ChaseLev.hpp | 14 +- .../kokkos/core/src/impl/Kokkos_ClockTic.hpp | 11 +- .../core/src/impl/Kokkos_Combined_Reducer.hpp | 26 +- .../core/src/impl/Kokkos_ConcurrentBitset.hpp | 30 +- packages/kokkos/core/src/impl/Kokkos_Core.cpp | 28 +- .../impl/Kokkos_Default_GraphNodeKernel.hpp | 39 +- .../impl/Kokkos_Default_GraphNode_Impl.hpp | 30 +- .../src/impl/Kokkos_Default_Graph_Impl.hpp | 37 +- packages/kokkos/core/src/impl/Kokkos_EBO.hpp | 20 +- .../core/src/impl/Kokkos_ExecPolicy.cpp | 2 +- .../core/src/impl/Kokkos_ExecSpaceManager.hpp | 8 +- .../src/impl/Kokkos_FixedBufferMemoryPool.hpp | 279 -- .../core/src/impl/Kokkos_FunctorAnalysis.hpp | 62 +- .../kokkos/core/src/impl/Kokkos_GraphImpl.hpp | 6 +- .../src/impl/Kokkos_GraphImpl_Utilities.hpp | 6 +- .../impl/Kokkos_GraphNodeCustomization.hpp | 2 +- .../core/src/impl/Kokkos_GraphNodeImpl.hpp | 43 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 26 +- .../core/src/impl/Kokkos_HostBarrier.hpp | 6 +- .../kokkos/core/src/impl/Kokkos_HostSpace.cpp | 8 +- .../src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 9 +- .../src/impl/Kokkos_HostSpace_deepcopy.cpp | 23 +- .../src/impl/Kokkos_HostSpace_deepcopy.hpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.cpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.hpp | 158 +- packages/kokkos/core/src/impl/Kokkos_LIFO.hpp | 8 +- .../core/src/impl/Kokkos_LinkedListNode.hpp | 2 +- .../src/impl/Kokkos_MemoryPoolAllocator.hpp | 103 - .../src/impl/Kokkos_MultipleTaskQueue.hpp | 57 +- 
.../kokkos/core/src/impl/Kokkos_Profiling.cpp | 2 - .../kokkos/core/src/impl/Kokkos_Profiling.hpp | 30 +- .../src/impl/Kokkos_Profiling_C_Interface.h | 2 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 9 + .../core/src/impl/Kokkos_SharedAlloc.cpp | 29 +- .../core/src/impl/Kokkos_SharedAlloc.hpp | 18 +- .../src/impl/Kokkos_SimpleTaskScheduler.hpp | 9 + .../core/src/impl/Kokkos_SingleTaskQueue.hpp | 12 +- .../core/src/impl/Kokkos_Stacktrace.cpp | 8 +- .../src/impl/Kokkos_StringManipulation.hpp | 10 +- .../kokkos/core/src/impl/Kokkos_TaskBase.hpp | 28 +- .../kokkos/core/src/impl/Kokkos_TaskNode.hpp | 22 +- .../core/src/impl/Kokkos_TaskPolicyData.hpp | 16 +- .../kokkos/core/src/impl/Kokkos_TaskQueue.hpp | 22 +- .../core/src/impl/Kokkos_TaskQueueCommon.hpp | 12 +- .../impl/Kokkos_TaskQueueMemoryManager.hpp | 5 + .../src/impl/Kokkos_TaskQueueMultiple.hpp | 22 +- .../impl/Kokkos_TaskQueueMultiple_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskQueue_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskResult.hpp | 5 + .../core/src/impl/Kokkos_TaskTeamMember.hpp | 8 +- .../core/src/impl/Kokkos_Tools_Generic.hpp | 304 ++- .../kokkos/core/src/impl/Kokkos_Traits.hpp | 5 +- .../core/src/impl/Kokkos_ZeroMemset_fwd.hpp | 2 +- .../kokkos/core/src/impl/Kokkos_hwloc.cpp | 2 +- .../core/src/setup/Kokkos_Setup_Cuda.hpp | 8 + .../core/src/setup/Kokkos_Setup_HIP.hpp | 17 + .../core/src/setup/Kokkos_Setup_SYCL.hpp | 8 + .../core/src/traits/Kokkos_IndexTypeTrait.hpp | 6 +- .../traits/Kokkos_IterationPatternTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 9 +- .../core/src/traits/Kokkos_WorkTagTrait.hpp | 4 +- packages/kokkos/core/unit_test/CMakeLists.txt | 1419 +++++------ .../core/unit_test/IncrementalTest.cpp.in | 2 - packages/kokkos/core/unit_test/Makefile | 21 +- packages/kokkos/core/unit_test/TestAbort.hpp | 11 +- packages/kokkos/core/unit_test/TestArray.cpp | 96 + .../kokkos/core/unit_test/TestArrayOps.hpp | 5 + .../core/unit_test/TestAtomicOperations.hpp | 138 +- 
.../TestAtomicOperations_complexdouble.hpp | 2 +- .../TestAtomicOperations_complexfloat.hpp | 4 + .../unit_test/TestAtomicOperations_double.hpp | 4 + .../unit_test/TestAtomicOperations_float.hpp | 4 + .../unit_test/TestAtomicOperations_int.hpp | 4 + .../TestAtomicOperations_longint.hpp | 4 + .../TestAtomicOperations_longlongint.hpp | 4 + .../unit_test/TestAtomicOperations_shared.hpp | 4 + .../TestAtomicOperations_unsignedint.hpp | 4 + .../TestAtomicOperations_unsignedlongint.hpp | 4 + ...stAtomicOperations_unsignedlonglongint.hpp | 4 + .../kokkos/core/unit_test/TestAtomicViews.hpp | 291 +-- .../kokkos/core/unit_test/TestAtomics.hpp | 273 +- .../unit_test/TestBitManipulationBuiltins.hpp | 4 +- ...e_d.cpp => TestCStyleMemoryManagement.cpp} | 25 +- .../kokkos/core/unit_test/TestCTestDevice.cpp | 76 +- packages/kokkos/core/unit_test/TestCXX11.hpp | 21 +- .../core/unit_test/TestCompilerMacros.cpp | 6 +- .../kokkos/core/unit_test/TestComplex.hpp | 48 +- .../kokkos/core/unit_test/TestConcepts.hpp | 5 +- .../core/unit_test/TestDeepCopyAlignment.hpp | 6 +- .../core/unit_test/TestDetectionIdiom.cpp | 16 +- .../unit_test/TestExecSpacePartitioning.hpp | 10 +- .../unit_test/TestExecSpaceThreadSafety.hpp | 53 +- .../core/unit_test/TestExecutionSpace.hpp | 3 +- .../core/unit_test/TestFunctorAnalysis.hpp | 30 +- packages/kokkos/core/unit_test/TestGraph.hpp | 562 +++- .../core/unit_test/TestHalfConversion.hpp | 4 - .../core/unit_test/TestHalfOperators.hpp | 121 +- .../TestHostSharedPtrAccessOnDevice.hpp | 16 +- packages/kokkos/core/unit_test/TestInit.hpp | 3 - .../unit_test/TestInitializationSettings.cpp | 11 +- .../kokkos/core/unit_test/TestInterOp.cpp | 75 +- .../core/unit_test/TestIrregularLayout.hpp | 4 +- .../core/unit_test/TestLocalDeepCopy.hpp | 2 - .../kokkos/core/unit_test/TestMDRange.hpp | 36 +- .../TestMDRangePolicyConstructors.hpp | 59 + .../core/unit_test/TestMDRangeReduce.hpp | 2 - .../kokkos/core/unit_test/TestMDRange_g.hpp | 2 +- 
.../core/unit_test/TestMDSpanConversion.hpp | 51 + .../unit_test/TestMathematicalFunctions.hpp | 213 +- .../TestMathematicalSpecialFunctions.hpp | 40 +- .../kokkos/core/unit_test/TestMemoryPool.hpp | 5 +- .../core/unit_test/TestNestedReducerCTAD.cpp | 8 +- .../core/unit_test/TestNumericTraits.hpp | 26 +- .../TestParseCmdLineArgsAndEnvVars.cpp | 5 +- packages/kokkos/core/unit_test/TestRange.hpp | 25 - .../unit_test/TestRangePolicyConstructors.hpp | 105 +- .../core/unit_test/TestRangePolicyRequire.hpp | 25 - packages/kokkos/core/unit_test/TestReduce.hpp | 49 +- .../unit_test/TestReduceCombinatorical.hpp | 26 +- .../kokkos/core/unit_test/TestReducers.hpp | 267 +- .../kokkos/core/unit_test/TestSharedAlloc.hpp | 4 +- .../kokkos/core/unit_test/TestSharedSpace.cpp | 8 +- .../TestSpaceAwareAccessorAccessViolation.hpp | 2 +- .../kokkos/core/unit_test/TestStackTrace.hpp | 2 + .../core/unit_test/TestTaskScheduler.hpp | 53 +- packages/kokkos/core/unit_test/TestTeam.hpp | 315 ++- .../kokkos/core/unit_test/TestTeamBasic.hpp | 2 +- .../unit_test/TestTeamCombinedReducers.hpp | 6 - .../kokkos/core/unit_test/TestTeamMDRange.hpp | 10 +- .../unit_test/TestTeamMDRangePolicyCTAD.cpp | 4 +- .../core/unit_test/TestTeamReductionScan.hpp | 109 +- .../kokkos/core/unit_test/TestTeamScan.hpp | 21 +- .../kokkos/core/unit_test/TestTeamScratch.hpp | 2 - .../kokkos/core/unit_test/TestTeamVector.hpp | 22 +- .../kokkos/core/unit_test/TestTypeInfo.cpp | 74 + .../kokkos/core/unit_test/TestTypeList.cpp | 8 +- .../kokkos/core/unit_test/TestUtilities.hpp | 12 +- .../kokkos/core/unit_test/TestViewAPI.hpp | 34 +- .../kokkos/core/unit_test/TestViewAPI_b.hpp | 30 + .../kokkos/core/unit_test/TestViewAPI_e.hpp | 23 +- .../core/unit_test/TestViewBadAlloc.hpp | 12 +- .../kokkos/core/unit_test/TestViewCopy_b.hpp | 4 +- .../core/unit_test/TestViewCtorDimMatch.hpp | 46 +- .../core/unit_test/TestViewCtorProp.hpp | 95 + .../unit_test/TestViewCtorPropEmbeddedDim.hpp | 16 +- .../core/unit_test/TestViewIsAssignable.hpp 
| 40 +- .../core/unit_test/TestViewMapping_a.hpp | 213 +- .../unit_test/TestViewMapping_subview.hpp | 8 +- .../TestViewMemoryAccessViolation.hpp | 2 +- .../unit_test/TestViewOutOfBoundsAccess.hpp | 2 +- .../kokkos/core/unit_test/TestViewRank.cpp | 4 +- .../kokkos/core/unit_test/TestViewSubview.hpp | 174 +- .../core/unit_test/TestViewTypedefs.cpp | 274 ++ .../kokkos/core/unit_test/TestView_64bit.hpp | 2 - .../unit_test/TestWithoutInitializing.hpp | 50 +- .../UnitTest_CMakePassCmdLineArgs.cpp | 9 +- .../UnitTest_CMakeTriBITSCompatibility.cpp | 33 + .../TestCudaHostPinned_Category.hpp | 4 +- .../TestSYCLHostUSM_Category.hpp | 2 +- .../TestSYCLSharedUSM_Category.hpp | 2 +- .../category_files/TestSYCL_Category.hpp | 2 +- .../unit_test/cuda/TestCuda_InterOp_Graph.cpp | 151 ++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 2 +- .../default/TestDefaultDeviceTypeViewAPI.cpp | 32 +- packages/kokkos/core/unit_test/diffconfig.sh | 18 - .../headers_self_contained/CMakeLists.txt | 14 +- .../unit_test/hip/TestHIP_InterOp_Graph.cpp | 127 + .../core/unit_test/hip/TestHIP_Spaces.cpp | 6 +- .../hip/TestHIP_UnifiedMemory_ZeroMemset.cpp | 44 + .../incremental/Test01_execspace.hpp | 4 + .../Test04_ParallelFor_RangePolicy.hpp | 2 +- .../Test05_ParallelReduce_RangePolicy.hpp | 8 +- .../incremental/Test10_HierarchicalBasics.hpp | 4 +- .../Test11a_ParallelFor_TeamThreadRange.hpp | 2 +- .../Test11b_ParallelFor_TeamVectorRange.hpp | 2 +- .../Test11c_ParallelFor_ThreadVectorRange.hpp | 2 +- .../incremental/Test12a_ThreadScratch.hpp | 10 +- .../incremental/Test12b_TeamScratch.hpp | 7 +- .../Test13c_ParallelRed_ThreadVectorRange.hpp | 2 +- .../incremental/Test16_ParallelScan.hpp | 6 +- .../unit_test/sycl/TestSYCL_InterOp_Graph.cpp | 114 + .../unit_test/sycl/TestSYCL_InterOp_Init.cpp | 2 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 11 +- .../sycl/TestSYCL_InterOp_Streams.cpp | 2 +- .../core/unit_test/sycl/TestSYCL_Spaces.cpp | 247 +- .../sycl/TestSYCL_TeamScratchStreams.cpp | 34 +- 
packages/kokkos/core/unit_test/testmake.sh | 18 - .../unit_test/tools/TestEventCorrectness.hpp | 41 +- .../core/unit_test/tools/TestKernelNames.cpp | 219 ++ .../unit_test/tools/TestProfilingSection.cpp | 12 +- .../core/unit_test/tools/TestScopedRegion.cpp | 12 +- .../core/unit_test/tools/TestTuning.cpp | 14 +- .../tools/include/ToolTestingUtilities.hpp | 144 +- .../core/unit_test/view/TestBasicView.hpp | 264 ++ .../view/TestBasicViewMDSpanConversion.cpp | 95 + .../view/TestExtentsDatatypeConversion.cpp | 6 +- .../view/TestReferenceCountedAccessor.hpp | 156 ++ .../view/TestReferenceCountedDataHandle.hpp | 208 ++ packages/kokkos/example/CMakeLists.txt | 11 +- .../example/query_device/CMakeLists.txt | 15 +- .../example/query_device/query_device.cpp | 2 +- .../relocatable_function/CMakeLists.txt | 6 + .../example/relocatable_function/Makefile | 33 + .../relocatable_function/functor.cpp} | 6 +- .../example/relocatable_function/main.cpp | 50 + .../tutorial/01_hello_world/CMakeLists.txt | 11 +- .../01_hello_world_lambda/CMakeLists.txt | 11 +- .../hello_world_lambda.cpp | 5 +- .../tutorial/02_simple_reduce/CMakeLists.txt | 10 +- .../02_simple_reduce_lambda/CMakeLists.txt | 11 +- .../simple_reduce_lambda.cpp | 14 +- .../tutorial/03_simple_view/CMakeLists.txt | 10 +- .../tutorial/03_simple_view/simple_view.cpp | 2 +- .../03_simple_view_lambda/CMakeLists.txt | 10 +- .../simple_view_lambda.cpp | 26 +- .../04_simple_memoryspaces/CMakeLists.txt | 10 +- .../simple_memoryspaces.cpp | 2 +- .../tutorial/05_simple_atomics/CMakeLists.txt | 11 +- .../06_simple_mdrangepolicy/CMakeLists.txt | 10 +- .../01_data_layouts/CMakeLists.txt | 10 +- .../02_memory_traits/CMakeLists.txt | 10 +- .../Advanced_Views/03_subviews/CMakeLists.txt | 10 +- .../04_dualviews/CMakeLists.txt | 10 +- .../Advanced_Views/04_dualviews/dual_view.cpp | 6 +- .../05_NVIDIA_UVM/CMakeLists.txt | 16 +- .../tutorial/Advanced_Views/CMakeLists.txt | 15 +- .../01_random_numbers/CMakeLists.txt | 5 + 
.../tutorial/Algorithms/CMakeLists.txt | 1 + .../kokkos/example/tutorial/CMakeLists.txt | 26 +- .../01_thread_teams/CMakeLists.txt | 10 +- .../01_thread_teams_lambda/CMakeLists.txt | 11 +- .../thread_teams_lambda.cpp | 5 +- .../02_nested_parallel_for/CMakeLists.txt | 10 +- .../03_vectorization/CMakeLists.txt | 11 +- .../04_team_scan/CMakeLists.txt | 11 +- .../Hierarchical_Parallelism/CMakeLists.txt | 10 +- .../tutorial/launch_bounds/CMakeLists.txt | 10 +- .../launch_bounds/launch_bounds_reduce.cpp | 5 +- packages/kokkos/generate_makefile.bash | 1 + packages/kokkos/master_history.txt | 1 + packages/kokkos/scripts/apply-clang-format | 4 +- .../kokkos/scripts/docker/Dockerfile.clang | 6 +- packages/kokkos/scripts/docker/Dockerfile.gcc | 2 +- .../kokkos/scripts/docker/Dockerfile.hipcc | 2 +- .../docker/Dockerfile.kokkosllvmproject | 2 +- .../kokkos/scripts/docker/Dockerfile.nvcc | 2 +- .../scripts/docker/Dockerfile.openmptarget | 3 +- .../kokkos/scripts/docker/Dockerfile.sycl | 3 +- .../kokkos/scripts/spack_test/CMakeLists.txt | 8 +- .../testing_scripts/gnu_test_all_sandia | 2 +- packages/kokkos/simd/CMakeLists.txt | 8 +- packages/kokkos/simd/src/CMakeLists.txt | 25 +- packages/kokkos/simd/src/Kokkos_SIMD.hpp | 2 +- packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 72 +- .../kokkos/simd/src/Kokkos_SIMD_AVX512.hpp | 98 +- packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp | 80 +- .../kokkos/simd/src/Kokkos_SIMD_Scalar.hpp | 2 +- .../kokkos/simd/unit_tests/CMakeLists.txt | 12 +- .../unit_tests/include/SIMDTesting_Ops.hpp | 4 + .../unit_tests/include/TestSIMD_MathOps.hpp | 15 +- .../include/TestSIMD_Reductions.hpp | 7 + .../include/TestSIMD_WhereExpressions.hpp | 8 +- packages/kokkos/tpls/.clang-format | 1 - .../include/desul/atomics/Atomic_Ref.hpp | 16 + .../desul/atomics/Compare_Exchange_SYCL.hpp | 8 + .../atomics/Lock_Based_Fetch_Op_SYCL.hpp | 8 + packages/kokkos/tpls/gtest/gtest/gtest.h | 9 +- .../experimental/__p0009_bits/layout_left.hpp | 3 + 
.../__p0009_bits/layout_right.hpp | 3 + .../__p0009_bits/layout_stride.hpp | 36 +- .../experimental/__p0009_bits/utility.hpp | 100 + .../__p2630_bits/submdspan_extents.hpp | 119 +- .../__p2630_bits/submdspan_mapping.hpp | 58 +- .../__p2642_bits/layout_padded.hpp | 26 +- .../__p2642_bits/layout_padded_fwd.hpp | 6 + 636 files changed, 21890 insertions(+), 17582 deletions(-) rename packages/kokkos/{appveyor.yml => .appveyor.yml} (82%) delete mode 100644 packages/kokkos/.clang-format-ignore create mode 100644 packages/kokkos/.cmake-format.py delete mode 100644 packages/kokkos/.codecov.yml delete mode 100644 packages/kokkos/HOW_TO_SNAPSHOT delete mode 100644 packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in rename packages/kokkos/{core/unit_test/sycl/TestSYCL_Task.cpp => cmake/compile_tests/amd_apu.cc} (57%) create mode 100644 packages/kokkos/cmake/kokkos_configure_trilinos.cmake create mode 100644 packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp create mode 100644 packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp create mode 100644 packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp delete mode 100644 packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp create mode 100644 packages/kokkos/core/src/Kokkos_TypeInfo.hpp create mode 100644 packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp delete mode 100644 packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp create mode 100644 packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp delete mode 100644 packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp delete mode 100644 packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp create mode 100644 packages/kokkos/core/src/View/Kokkos_BasicView.hpp rename packages/kokkos/core/src/{impl/Kokkos_Atomic_View.hpp => View/Kokkos_ViewAtomic.hpp} (96%) rename packages/kokkos/core/src/{impl => View}/Kokkos_ViewCtor.hpp (84%) rename 
packages/kokkos/core/src/{impl => View}/Kokkos_ViewDataAnalysis.hpp (96%) create mode 100644 packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp rename packages/kokkos/core/src/{impl => View}/Kokkos_ViewMapping.hpp (90%) rename packages/kokkos/core/src/{impl => View}/Kokkos_ViewTracker.hpp (100%) create mode 100644 packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp rename packages/kokkos/core/src/{impl => View}/Kokkos_ViewUniformType.hpp (88%) delete mode 100644 packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp delete mode 100644 packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp rename packages/kokkos/core/unit_test/{default/TestDefaultDeviceType_d.cpp => TestCStyleMemoryManagement.cpp} (73%) create mode 100644 packages/kokkos/core/unit_test/TestTypeInfo.cpp create mode 100644 packages/kokkos/core/unit_test/TestViewCtorProp.hpp create mode 100644 packages/kokkos/core/unit_test/TestViewTypedefs.cpp create mode 100644 packages/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp create mode 100644 packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp delete mode 100755 packages/kokkos/core/unit_test/diffconfig.sh create mode 100644 packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp create mode 100644 packages/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp create mode 100644 packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp delete mode 100755 packages/kokkos/core/unit_test/testmake.sh create mode 100644 packages/kokkos/core/unit_test/tools/TestKernelNames.cpp create mode 100644 packages/kokkos/core/unit_test/view/TestBasicView.hpp create mode 100644 packages/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp create mode 100644 packages/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp create mode 100644 packages/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp create mode 100644 packages/kokkos/example/relocatable_function/CMakeLists.txt create mode 100644 
packages/kokkos/example/relocatable_function/Makefile rename packages/kokkos/{core/src/impl/KokkosExp_ViewMapping.hpp => example/relocatable_function/functor.cpp} (81%) create mode 100644 packages/kokkos/example/relocatable_function/main.cpp create mode 100644 packages/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt create mode 100644 packages/kokkos/example/tutorial/Algorithms/CMakeLists.txt diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/.appveyor.yml similarity index 82% rename from packages/kokkos/appveyor.yml rename to packages/kokkos/.appveyor.yml index d0a5645ef7b6..23cac222ca38 100644 --- a/packages/kokkos/appveyor.yml +++ b/packages/kokkos/.appveyor.yml @@ -5,6 +5,6 @@ build_script: - cmd: >- mkdir build && cd build && - cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON && cmake --build . 
--target install && ctest -C Debug --output-on-failure diff --git a/packages/kokkos/.clang-format b/packages/kokkos/.clang-format index db5f94fa2ebb..090edc2c51f1 100644 --- a/packages/kokkos/.clang-format +++ b/packages/kokkos/.clang-format @@ -1,4 +1,4 @@ -#Official Tool: clang-format version 8.0.0 +#Official Tool: clang-format version 16.0.0 BasedOnStyle: google SortIncludes: false AlignConsecutiveAssignments: true diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore deleted file mode 100644 index 43d242c3106a..000000000000 --- a/packages/kokkos/.clang-format-ignore +++ /dev/null @@ -1,3 +0,0 @@ -core/unit_test/config/results/* -tpls/gtest/gtest/* -core/src/desul/* diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy index 2b0d6e51d438..f1aba1f52e5f 100644 --- a/packages/kokkos/.clang-tidy +++ b/packages/kokkos/.clang-tidy @@ -1,3 +1,3 @@ -Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' +Checks: '-*,kokkos-*,modernize-type-traits,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' FormatStyle: file -HeaderFilterRegex: '.*/*.hpp' +HeaderFilterRegex: '(algorithms|benchmarks|containers|core|example|simd).*\.hpp' diff --git a/packages/kokkos/.cmake-format.py b/packages/kokkos/.cmake-format.py new file mode 100644 index 000000000000..6a66b6a14ec8 --- /dev/null +++ b/packages/kokkos/.cmake-format.py @@ -0,0 +1,28 @@ +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + + # How wide to allow formatted cmake files + line_width = 120 + + # If an argument group contains more than this many sub-groups (parg or kwarg + # groups) then force it to a vertical layout. + max_subgroups_hwrap = 3 + + # If a statement is wrapped to more than one line, than dangle the closing + # parenthesis on its own line. 
+ dangle_parens = True + + # If the trailing parenthesis must be 'dangled' on its on line, then align it + # to this reference: `prefix`: the start of the statement, `prefix-indent`: + # the start of the statement, plus one indentation level, `child`: align to + # the column of the arguments + dangle_align = 'prefix' + +# ------------------------------------------------ +# Options affecting comment reflow and formatting. +# ------------------------------------------------ +with section("markup"): + # enable comment markup parsing and reflow + enable_markup = False diff --git a/packages/kokkos/.codecov.yml b/packages/kokkos/.codecov.yml deleted file mode 100644 index 097b0264a272..000000000000 --- a/packages/kokkos/.codecov.yml +++ /dev/null @@ -1,11 +0,0 @@ -coverage: - precision: 1 - round: down - range: "70...100" -ignore: - - tpls/ - - algorithms/unit_tests - - core/perf_test/ - - core/unit_test/ - - containers/performance_tests - - containers/unit_tests diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 1635a69f298f..5790ccb93ce1 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -30,7 +30,157 @@ pipeline { sh './scripts/docker/check_format_cpp.sh' } } - stage('Build') { + stage('Build-1') { + parallel { + stage('GCC-8.4.0') { + agent { + dockerfile { + filename 'Dockerfile.gcc' + dir 'scripts/docker' + label 'docker' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_NESTED = 'true' + OMP_MAX_ACTIVE_LEVELS = 3 + OMP_PROC_BIND = 'true' + } + steps { + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_LIBDL=OFF \ + -DKokkos_ENABLE_LIBQUADMATH=ON \ + -DKokkos_ENABLE_SERIAL=ON \ + .. 
&& \ + make -j8 && ctest --no-compress-output -T Test --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('HIP-ROCm-5.6-C++20') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete@sha256:578a310fb1037d9c5e23fded2564f239acf6dc7231ff4742d2e7279fe7cc5c4a' + label 'rocm-docker' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. 
&& \ + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('CUDA-11.0-NVCC-RDC') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.0.3-devel-ubuntu20.04@sha256:10ab0f09fcdc796b4a2325ef1bce8f766f4a3500eab5a83780f80475ae26c7a6 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' + label 'nvidia-docker && (volta || ampere)' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + // Nested OpenMP does not work for this configuration, + // so disabling it + OMP_MAX_ACTIVE_LEVELS = 1 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf install && mkdir -p install && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_OPENMP=OFF \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ + -DKokkos_ENABLE_CUDA_UVM=ON \ + -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + \ + -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ + .. && \ + make -j8 install && \ + cd .. 
&& \ + rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ + export CMAKE_PREFIX_PATH=${PWD}/../install && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS="-Werror --Werror=all-warnings -Xcudafe --diag_suppress=940" \ + -DCMAKE_EXE_LINKER_FLAGS="-Xnvlink -suppress-stack-size-warning" \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_INSTALL_TESTING=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ + cd ../example/build_cmake_installed && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && ctest --verbose && \ + cd ../.. && \ + cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ + cmake --build build_cmake_installed_different_compiler/build --target all && \ + cmake --build build_cmake_installed_different_compiler/build --target test''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build-tests/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + } + } + stage('Build-2') { parallel { stage('OPENACC-NVHPC-CUDA-12.2') { agent { @@ -49,14 +199,21 @@ pipeline { /opt/cmake/bin/cmake \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_OPENACC=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } stage('CUDA-12.2-NVHPC-AS-HOST-COMPILER') { agent { @@ -82,17 +239,22 @@ pipeline { -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="--diag_suppress=implicit_return_from_non_void_function" \ + -DCMAKE_CXX_FLAGS="-Werror --diag_suppress=implicit_return_from_non_void_function" \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } } + } stage('SYCL-OneAPI') { agent { @@ -117,19 +279,20 @@ pipeline { -DKokkos_ARCH_AMPERE80=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON \ -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ -DCMAKE_CXX_STANDARD=17 \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -138,7 +301,7 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete@sha256:4030c8af0c06c286174758523dabe4b3850bf72d4a8c1ef275d3ec69aa475f65' label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } @@ -168,46 +331,12 @@ pipeline { -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ .. && \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('HIP-ROCm-5.6-C++20') { - agent { - dockerfile { - filename 'Dockerfile.hipcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete' - label 'rocm-docker' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DBUILD_SHARED_LIBS=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ - -DCMAKE_CXX_STANDARD=20 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_HIP=ON \ - .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -248,7 +377,7 @@ pipeline { -DKokkos_ARCH_AMD_GFX906=ON \ && \ cmake --build build --parallel ${BUILD_JOBS} && \ - cd build && ctest --output-on-failure + cd build && ctest --no-compress-output -T Test --output-on-failure ''' } post { @@ -277,7 +406,6 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_TUNING=ON \ @@ -285,11 +413,12 @@ pipeline { -DKokkos_ARCH_VOLTA70=ON \ -DCMAKE_CXX_STANDARD=17 \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -315,119 +444,18 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. 
&& \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.7-NVCC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.1-devel-ubuntu20.04' - label 'nvidia-docker && volta' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - ../gnu_generate_makefile.bash \ - --with-options=compiler_warnings \ - --cxxflags="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ - --cxxstandard=c++17 \ - --with-cuda \ - --with-cuda-options=enable_lambda \ - --arch=Volta70 \ - && \ - make test -j8''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.0-NVCC-RDC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' - label 'nvidia-docker && (volta || ampere)' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - environment { - OMP_NUM_THREADS = 8 - // Nested OpenMP does not work for this configuration, - // so disabling it - OMP_MAX_ACTIVE_LEVELS = 1 - OMP_PLACES = 'threads' - OMP_PROC_BIND = 'spread' - NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf install && mkdir -p install && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_OPENMP=OFF \ - -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ - -DKokkos_ENABLE_CUDA_UVM=ON \ - 
-DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ - .. && \ - make -j8 install && \ - cd .. && \ - rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ - export CMAKE_PREFIX_PATH=${PWD}/../install && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS=-Werror --Werror=all-warnings -Xcudafe --diag_suppress=3159 \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_INSTALL_TESTING=ON \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../example/build_cmake_installed && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../.. && \ - cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ - cmake --build build_cmake_installed_different_compiler/build --target all && \ - cmake --build build_cmake_installed_different_compiler/build --target test''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -436,7 +464,7 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.2-devel-ubuntu20.04' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.6.2-devel-ubuntu20.04@sha256:d95d54bc231f8aea7fda79f60da620324584b20ed31a8ebdb0686cffd34dd405' label 'nvidia-docker && (volta || ampere)' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } @@ 
-449,7 +477,7 @@ pipeline { -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ + -DCMAKE_CXX_FLAGS="-Werror -Werror=all-warnings" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ @@ -459,13 +487,12 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ - -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \ + -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=ON \ .. && \ - make -j8 && ctest --verbose && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ cd ../example/build_cmake_in_tree && \ rm -rf build && mkdir -p build && cd build && \ cmake -DCMAKE_CXX_STANDARD=17 .. && make -j8 && ctest --verbose''' @@ -473,41 +500,37 @@ pipeline { post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } - stage('GCC-8.4.0') { + stage('CUDA-11.7-NVCC') { agent { dockerfile { - filename 'Dockerfile.gcc' + filename 'Dockerfile.nvcc' dir 'scripts/docker' - label 'docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.7.1-devel-ubuntu20.04@sha256:fc997521e612899a01dce92820f5f5a201dd943ebfdc3e49ba0706d491a39d2d' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } - environment { - OMP_NUM_THREADS = 8 - OMP_NESTED = 'true' - OMP_MAX_ACTIVE_LEVELS = 3 - OMP_PROC_BIND = 'true' - } steps { + sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DKokkos_ARCH_NATIVE=ON \ - 
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ENABLE_LIBDL=OFF \ - -DKokkos_ENABLE_LIBQUADMATH=ON \ - -DKokkos_ENABLE_SERIAL=ON \ - .. && \ - make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + ../gnu_generate_makefile.bash \ + --with-options=compiler_warnings \ + --cxxflags="-Werror -Werror=all-warnings" \ + --cxxstandard=c++17 \ + --with-cuda \ + --with-cuda-options=enable_lambda \ + --arch=Volta70 \ + && \ + make test -j8''' + } + post { + always { + sh 'ccache --show-stats' + } } } } diff --git a/packages/kokkos/.jenkins_nightly b/packages/kokkos/.jenkins_nightly index 8dd02e9f028a..15bef607258f 100644 --- a/packages/kokkos/.jenkins_nightly +++ b/packages/kokkos/.jenkins_nightly @@ -93,19 +93,23 @@ pipeline { -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. 
&& \ - make -j8 && ctest --verbose + make -j8 && ctest --no-compress-output -T Test --verbose ''' } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } } - stage('HIP-ROCM-6.1') { + stage('HIP-ROCM-6.2') { agent { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.1.2-complete' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.2-complete' label 'rocm-docker && AMD_Radeon_Instinct_MI210' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } @@ -125,16 +129,16 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index 7b1d69e56630..6c237ebca867 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,7 +1,101 @@ # CHANGELOG +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M [\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A [\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace 
[\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types [\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to 
`view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` [\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided [\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type 
[\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` [\#7485](https://github.com/kokkos/kokkos/pull/7485) + ## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) -[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) ### Features: * Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) @@ -13,7 +107,7 @@ ### Bug Fixes * OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) -* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) @@ -57,6 +151,7 @@ * SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) * Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` 
[\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) * Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) ### Build System Changes * Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) @@ -1388,7 +1483,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 736cbac218c2..f0bf8e3634a9 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." ) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." 
+ ) endif() # We want to determine if options are given with the wrong case @@ -15,143 +14,142 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. -GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) - -# Is this a build as part of Trilinos? 
-IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() -# Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) +# Is this build a subdirectory of another project +get_directory_property(HAS_PARENT PARENT_DIRECTORY) -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) + +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature 
${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() - - -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) - -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) - - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 - # days. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. 
- SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() - - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. 
Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() + +list(APPEND CMAKE_MODULE_PATH cmake/Modules) + +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. + # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. 
+ set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() + +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() + + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() + +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. 
The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). 
Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - +include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT 
KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,102 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # 
# A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - ENDIF() - #These flags get set up in KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the 
subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. Other libraries and @@ -327,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}. 
# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/packages/kokkos/CONTRIBUTING.md b/packages/kokkos/CONTRIBUTING.md index b4f3057cef2c..e97f8c4d89c5 100644 --- a/packages/kokkos/CONTRIBUTING.md +++ b/packages/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/packages/kokkos/HOW_TO_SNAPSHOT b/packages/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4f8..000000000000 --- a/packages/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. 
- ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. -1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. - If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. 
- git rebase -i - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit. - git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher for questions> - ------------------------------------------------------------------------- - diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index ccb568a553ce..9e6ad3241564 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace # IBM: Power8,Power9 -# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" @@ -30,16 +30,19 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. 
-# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. # Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) @@ -82,7 +85,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -93,6 +96,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) 
+KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -168,7 +173,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -282,6 +287,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -401,8 +408,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -455,6 +462,15 @@ KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH), ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -550,6 +566,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -722,7 +741,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -1013,86 +1032,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1108,6 +1163,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1115,38 +1173,43 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1155,8 +1218,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) 
ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1166,6 +1229,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. @@ -1219,6 +1297,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1306,6 +1386,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1358,6 +1440,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := 
$(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. 
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1468,7 +1592,11 @@ else endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") endif @@ -1496,6 +1624,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets index e8e429e02750..be535eea3e7c 100644 --- a/packages/kokkos/Makefile.targets +++ b/packages/kokkos/Makefile.targets @@ -16,8 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp @@ -38,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -73,6 +75,8 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp 
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif @@ -89,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md index c8c6f8f7cf50..0ea07f9ea2f6 100644 --- a/packages/kokkos/README.md +++ b/packages/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). +The current release is [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00). 
```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt index 368984647e9f..73ce9f7ec552 100644 --- a/packages/kokkos/algorithms/CMakeLists.txt +++ b/packages/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt index b490caca6282..9f10b85e0214 100644 --- a/packages/kokkos/algorithms/src/CMakeLists.txt +++ b/packages/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS *.hpp) +file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) -KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 7df12b8518eb..b28ea4c2ca9a 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template struct Random_UniqueIndex { using locks_view_type = View; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template -struct Random_UniqueIndex< - 
Kokkos::Device> { +struct Random_UniqueIndex> { using locks_view_type = - View>; + View>; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -1121,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View; using int_view_type = View; - using state_data_type = View; + using state_data_type = View; locks_type locks_ = {}; state_data_type state_ = {}; diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572c5..8e7de32a07b2 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / + mul_(static_cast(max_bins) / (static_cast(max) - static_cast(min))), min_(static_cast(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. 
if (std::is_integral::value && (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { + static_cast(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast(max_bins[0]) / (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / + mul_[1] = static_cast(max_bins[1]) / (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / + mul_[2] = static_cast(max_bins[2]) / (static_cast(max[2]) - static_cast(min[2])); min_[0] = static_cast(min[0]); min_[1] = static_cast(min[1]); diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe48f..f417b6b13b3c 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index 308e9e3a008b..20026c77e415 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -53,9 +53,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort without comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -107,9 +111,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort with comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp 
b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index f11f80704843..2a8f761d9b4f 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -30,6 +30,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -76,13 +77,10 @@ namespace Kokkos::Impl { template constexpr inline bool is_admissible_to_kokkos_sort_by_key = - ::Kokkos::is_view::value&& T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value); + ::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v); template KOKKOS_INLINE_FUNCTION constexpr void @@ -144,7 +142,7 @@ void sort_by_key_rocthrust( #if defined(KOKKOS_ENABLE_ONEDPL) template -inline constexpr bool sort_on_device_v = +inline constexpr bool sort_on_device_v = std::is_same_v || std::is_same_v; @@ -152,7 +150,7 @@ inline constexpr bool sort_on_device_v = template void sort_by_key_onedpl( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... 
maybeComparator) { @@ -176,7 +174,7 @@ template void applyPermutation(const ExecutionSpace& space, const PermutationView& permutation, const ViewType& view) { - static_assert(std::is_integral::value); + static_assert(std::is_integral_v); auto view_copy = Kokkos::create_mirror( Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, @@ -335,7 +333,7 @@ void sort_by_key_device_view_without_comparator( template void sort_by_key_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values) { #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY @@ -392,7 +390,7 @@ void sort_by_key_device_view_with_comparator( template void sort_by_key_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, const ComparatorType& comparator) { diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 08946228919b..734ce450f69e 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -146,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... 
int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { + if (std::is_integral_v) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast(result.max_val); auto const min_val = static_cast(result.min_val); @@ -157,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point::value) { + if (std::is_floating_point_v) { KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - static_cast(result.min_val))); } @@ -211,11 +212,11 @@ void sort_rocthrust(const HIP& space, #if defined(KOKKOS_ENABLE_ONEDPL) template -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View& view, MaybeComparator&&... maybeComparator) { using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -268,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward(maybeComparator)...); + if 
(view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -310,7 +321,7 @@ void sort_device_view_without_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& view) { using ViewType = Kokkos::View; static_assert( @@ -365,8 +376,7 @@ void sort_device_view_with_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View& view, + const Kokkos::SYCL& exec, const Kokkos::View& view, const ComparatorType& comparator) { using ViewType = Kokkos::View; static_assert( @@ -397,12 +407,12 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. - using ViewType = Kokkos::View; - using MemSpace = typename ViewType::memory_space; // Note with HIP unified memory this code path is still the right thing to do // if we end up here when RocThrust is not enabled. // The create_mirror_view_and_copy will do the right thing (no copy). 
-#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb50..ea7e55ca6190 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); 
return Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = 
::Kokkos::Experimental; diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f68a..89585ddbea0c 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const 
ExecutionSpace& ex, IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const 
TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 +412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 54bb13e25b9e..da16141f5a7f 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -33,12 +33,12 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)>> + (std::is_same_v || + std::is_same_v || + std::is_same_v)>> : std::true_type {}; template @@ -102,8 +102,8 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static 
constexpr bool value = - is_iterator_v && std::is_base_of::value; + is_iterator_v && std::is_base_of_v; }; template @@ -165,9 +165,8 @@ struct iterators_have_matching_difference_type { template struct iterators_have_matching_difference_type { - static constexpr bool value = - std::is_same::value; + static constexpr bool value = std::is_same_v; }; template diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d460e..dc910861d507 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5bce89e98f7f..e8c638c94c75 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View > { using iterator_type = RandomAccessIterator; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + 
std::is_same_v), "RandomAccessIterator only supports 1D Views with LayoutLeft, " "LayoutRight, LayoutStride."); @@ -61,9 +61,9 @@ class RandomAccessIterator< ::Kokkos::View > { #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond template - requires(std::is_constructible_v) KOKKOS_FUNCTION - explicit(!std::is_convertible_v) - RandomAccessIterator(const RandomAccessIterator& other) + requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) : m_view(other.m_view), m_current_index(other.m_current_index) {} #else template < diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index b4046c7645bd..e6caa0728805 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e3995..7aa0e4fc44c8 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt 
b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index db184bc8a999..31247af159b9 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,21 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortByKey - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -47,14 +35,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() @@ -65,11 +48,7 @@ endforeach() # std set A # ------------------------------------------ set(STDALGO_SOURCES_A) -foreach(Name - StdReducers - StdAlgorithmsConstraints - RandomAccessIterator - ) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) endforeach() @@ -77,10 +56,7 @@ endforeach() # std set B # ------------------------------------------ set(STDALGO_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) endforeach() @@ -88,22 +64,23 @@ endforeach() # std set C # ------------------------------------------ set(STDALGO_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + 
StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) endforeach() @@ -111,27 +88,28 @@ endforeach() # std set D # ------------------------------------------ set(STDALGO_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) endforeach() @@ -139,20 +117,21 @@ endforeach() # std set E # ------------------------------------------ set(STDALGO_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) +foreach( + 
Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() @@ -160,11 +139,7 @@ endforeach() # std team Q # ------------------------------------------ set(STDALGO_TEAM_SOURCES_Q) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) endforeach() @@ -172,11 +147,7 @@ endforeach() # std team P # ------------------------------------------ set(STDALGO_TEAM_SOURCES_P) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() @@ -184,14 +155,9 @@ endforeach() # std team M # ------------------------------------------ set(STDALGO_TEAM_SOURCES_M) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) endforeach() @@ -199,14 +165,9 @@ endforeach() # std team L # ------------------------------------------ set(STDALGO_TEAM_SOURCES_L) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted 
- StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) endforeach() @@ -214,13 +175,9 @@ endforeach() # std team I # ------------------------------------------ set(STDALGO_TEAM_SOURCES_I) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) endforeach() @@ -228,18 +185,19 @@ endforeach() # std team H # ------------------------------------------ set(STDALGO_TEAM_SOURCES_H) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() @@ -247,13 +205,9 @@ endforeach() # std team G # ------------------------------------------ set(STDALGO_TEAM_SOURCES_G) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) +foreach(Name StdAlgorithmsCommon 
StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) endforeach() @@ -261,13 +215,9 @@ endforeach() # std team F # ------------------------------------------ set(STDALGO_TEAM_SOURCES_F) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) endforeach() @@ -275,15 +225,16 @@ endforeach() # std team E # ------------------------------------------ set(STDALGO_TEAM_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) endforeach() @@ -291,12 +242,7 @@ endforeach() # std team D # ------------------------------------------ set(STDALGO_TEAM_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) endforeach() @@ -304,16 +250,17 @@ endforeach() # std team C # ------------------------------------------ set(STDALGO_TEAM_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - 
StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) endforeach() @@ -321,13 +268,9 @@ endforeach() # std team B # ------------------------------------------ set(STDALGO_TEAM_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) endforeach() @@ -335,34 +278,33 @@ endforeach() # std team A # ------------------------------------------ set(STDALGO_TEAM_SOURCES_A) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
-if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. 
if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. 
- list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. 
if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp index dd3569e6715a..bb074f248034 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View keys_, + bin3d_is_sorted_struct(Kokkos::View keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; - sum3D(Kokkos::View keys_) : keys(keys_) {} + sum3D(Kokkos::View keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void 
test_sort_integer_overflow() { } // namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. 
+TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf3158..d11b53a9a61f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc52..cd57fd23ecfa 100644 --- a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, 
NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b2d..6960b912d0e3 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp 
b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 7d484136b6dd..5ab348cb1933 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); @@ -264,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v); static_assert(KE::Impl::are_random_access_iterators_v); static_assert(!KE::Impl::are_iterators_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); } } // namespace stdalgos diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b74..5ea88ae5d628 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with 
CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 9e5bd4a57487..44abe4e73a4b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -83,8 +83,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } // Test #7036 @@ -95,8 +95,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } TEST(TEST_CATEGORY, SortByKey) { @@ -183,12 +183,12 @@ TEST(TEST_CATEGORY, 
SortByKeyStaticExtents) { Kokkos::View keys("keys"); Kokkos::View values_static("values_static"); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_static)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); Kokkos::View values_dynamic("values_dynamic", 10); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); } template @@ -234,7 +234,9 @@ TEST(TEST_CATEGORY, SortByKeyWithStrides) { ASSERT_EQ(sort_fails, 0u); } -TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee4..208b46b15f27 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbef8..d8b80675c9d8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid 
choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 67052e2f9d4d..dadce2d4748a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -534,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -546,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View; @@ -566,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -583,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template 
-std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 2a4525a8c332..923ea970f91d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,7 +81,7 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } -TEST(std_algorithms, expect_no_overlap) { +TEST(std_algorithms_DeathTest, expect_no_overlap) { namespace KE = Kokkos::Experimental; using value_type = double; @@ -104,6 +104,8 @@ TEST(std_algorithms, expect_no_overlap) { // Overlapping because iterators are identical #if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -148,8 +150,7 @@ TEST(std_algorithms, expect_no_overlap) { auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not overlap since offset (=3) is not divisible by stride (=2) - EXPECT_NO_THROW( - { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); // Iterating over the same range without overlapping Kokkos::View static_view_2d{ @@ -160,9 +161,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... 
- EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); - }); + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -172,9 +171,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); - }); + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ @@ -185,9 +182,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); - }); + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); } } // namespace stdalgos diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be04d..7c9e8f84bfa4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, 
"copy_if_dest"); auto rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index b364c53a8882..a85e63fe3454 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f16..b24730ff0094 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { 
KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), non_mod_lambda); std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e30b..2b3361743e4d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template struct IsEvenFunctor { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index a08a73721088..b4f40b4651d6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 75d4f0afebce..18928a352669 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool 
compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -154,7 +155,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 29ac7cc9bc12..8327bfe13c0b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c446..df5df756d2ae 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: 
mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 1b1a02f39c4f..6918185bc083 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value); + static_assert(std::is_rvalue_reference_v); // move constr MyMovableType b(std::move(a)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e43..42a17d737796 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1a6..88e2a68ff17d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); 
ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index a36c9db2b9eb..e47cacdd7d9c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,9 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value); + std::is_same_v); static_assert( - std::is_same::value); + std::is_same_v); const std::size_t ext = view_from.extent(0); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b20..f897e9b65749 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108be0..3137880ea813 100644 
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa92356..d88ab5473de6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234eedb..e42788799e47 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - 
KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index a22ab32d764a..4596726cf3ce 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e173e..b18c859af593 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba8897119..82f859bac124 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { 
- throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5cc..5ae2ff427853 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca7278300..3c934d64850c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a62..bf5c2ee7828b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea3f..1a860c58cee8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444d2..195f88a0b737 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 53ad8daa2ec9..79d88bec23f7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void 
fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216eac..12835d5a2f7c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae9437590..3e350cf3b384 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void 
run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9bba..5a2c04693945 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf012..071ecd5a9a80 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - 
KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 7c3c465dc8d0..3f83ac7404fe 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e3c..9b509af55bf2 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd98..38df5c30cec8 100644 --- 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View randomIndices( diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 7cb9851087a1..0c35c5e59934 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e06e..88c5e21f312f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569e1..d350bc62cdb3 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ -86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a30d..70f2be77f632 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e68..873e8faf4cad 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131fa..265cdf474616 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor()); + GenerateFunctor()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index 850e80dde1e0..f76a595b3f42 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index e3b95527c77f..5bc49e46007e 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -77,8 +77,8 @@ struct TestFunctorA { else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - CustomLessThanComparator{}); + KE::cend(myRowView), + CustomLessThanComparator{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index 283525dbd10f..452a48df2161 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 8579b48315d8..2c79370b926e 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = 
KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 51010fdff59b..25a4487855b6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { auto itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554ac..2c445dacf8e7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b3d..2defa1dc6fc8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 6bb0d249988d..71a50e39e3ef 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index cff9aa178a29..d5b5304f6315 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - 
KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574b8..64f172e401cc 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index d0217aed7a8e..9c3699320d8c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = 
KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -151,7 +151,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f600..51f600fabad6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 +136,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef070..08ff8fbbca62 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct 
TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa7b..60cb3f083779 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 0b0d798fd801..78a21c443055 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); 
Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8f6..cef0f7c13d07 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator{}); + CustomEqualityComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator{}); + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 0d3289e196f0..89ea8154c7ec 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) 
= resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator{}); + KE::begin(myRowDest), + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fa2804256ac2..365ca21688b4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -161,7 +161,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index fb81ae91b049..cc8726214786 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ 
-115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -173,7 +173,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a1e..6ee93e3d5fa0 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db8f..e3e969645839 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp index 
c05006a1617c..0044b935587f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,7 +80,7 @@ auto create_host_view_with_reduction_order_indices( template auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; @@ -191,7 +191,7 @@ template void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; diff --git a/packages/kokkos/benchmarks/CMakeLists.txt b/packages/kokkos/benchmarks/CMakeLists.txt index 529ef393d994..968c8ae3bf59 100644 --- a/packages/kokkos/benchmarks/CMakeLists.txt +++ b/packages/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,12 @@ #FIXME_OPENMPTARGET - compiling in debug mode causes ICE. -KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. 
Commenting them for now but a deeper analysis on the cause and a possible fix will follow. -IF(NOT Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) -ENDIF() +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/packages/kokkos/benchmarks/atomic/CMakeLists.txt b/packages/kokkos/benchmarks/atomic/CMakeLists.txt index 85f7412f492f..7fda2bf6f6a4 100644 --- a/packages/kokkos/benchmarks/atomic/CMakeLists.txt +++ b/packages/kokkos/benchmarks/atomic/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - atomic - SOURCES main.cpp -) +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt index 0ce44a6f1a8e..9c65d06ce28e 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt +++ b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -1,4 +1,9 @@ -KOKKOS_ADD_EXECUTABLE( +kokkos_add_executable( bytes_and_flops - SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp ) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 78cfd48effec..762cc988f14e 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,9 +17,9 @@ template struct Run { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View A("A", N, K); - Kokkos::View B("B", N, K); - Kokkos::View C("C", N, K); + Kokkos::View A("A", N, K); + Kokkos::View B("B", N, K); + Kokkos::View C("C", N, K); Kokkos::deep_copy(A, 
Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); diff --git a/packages/kokkos/benchmarks/gather/CMakeLists.txt b/packages/kokkos/benchmarks/gather/CMakeLists.txt index 24c706277259..2de1ce85e637 100644 --- a/packages/kokkos/benchmarks/gather/CMakeLists.txt +++ b/packages/kokkos/benchmarks/gather/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gather - SOURCES main.cpp -) +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/gups/CMakeLists.txt b/packages/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc67f..dc7074702925 100644 --- a/packages/kokkos/benchmarks/gups/CMakeLists.txt +++ b/packages/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/packages/kokkos/benchmarks/gups/gups.cpp b/packages/kokkos/benchmarks/gups/gups.cpp index 369052321d7b..e00f87968bde 100644 --- a/packages/kokkos/benchmarks/gups/gups.cpp +++ b/packages/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt index bb14da749d12..4775bf2261e0 100644 --- a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt +++ b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - launch_latency - SOURCES launch_latency.cpp -) +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp index 73b176ab8dd7..156c29af09e6 100644 --- a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp +++ b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -254,7 +254,7 
@@ int main(int argc, char* argv[]) { else if (i == 3) K = atoi(arg.data()); else { - throw std::runtime_error("unexpected argument!"); + Kokkos::abort("unexpected argument!"); } } else if (arg == "--no-parallel-for") { opts.par_for = false; @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) { } else { std::stringstream ss; ss << "unexpected argument \"" << arg << "\" at position " << i; - throw std::runtime_error(ss.str()); + Kokkos::abort(ss.str().c_str()); } } diff --git a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt index 929b9c970237..4a939775c0bc 100644 --- a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt +++ b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - policy_performance - SOURCES main.cpp -) +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/stream/CMakeLists.txt b/packages/kokkos/benchmarks/stream/CMakeLists.txt index 0dded6e3a541..b096976c486f 100644 --- a/packages/kokkos/benchmarks/stream/CMakeLists.txt +++ b/packages/kokkos/benchmarks/stream/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - stream - SOURCES stream-kokkos.cpp -) +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt index 50a331b2b354..f7bbc13b6ec5 100644 --- a/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt +++ b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - view_copy_constructor - SOURCES view_copy_constructor.cpp -) +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler index d1f8896f91b1..ee3c29e96d3a 100755 --- 
a/packages/kokkos/bin/kokkos_launch_compiler +++ b/packages/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake index fb1e73b5799c..2f70c2f038c1 100644 --- a/packages/kokkos/cmake/Dependencies.cmake +++ b/packages/kokkos/cmake/Dependencies.cmake @@ -1,5 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index a93007ff83f6..44f81bb8cea2 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -24,7 +24,6 @@ #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 @@ -40,7 +39,10 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#cmakedefine 
KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -80,6 +82,7 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -118,9 +121,11 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX90A #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GFX1103 +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 626ef5a8ebef..000000000000 --- a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. 
- get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c06..b8ac2048b5fc 100644 --- a/packages/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. -if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. 
set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. 
set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. 
- find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... 
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add 
known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. 
include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. 
if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. 
These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. 
- _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH 
PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake index 445f4e93a592..3a6a826197ec 100644 --- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,44 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") - MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") find_package(CUDAToolkit REQUIRED) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) -ELSE() + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) - IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) - ELSE() - SET(FOUND_CUDART FALSE) - ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() 
+ set(FOUND_CUDART FALSE) + endif() - IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) - ELSE() - SET(FOUND_CUDA_DRIVER FALSE) - ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) - IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - ENDIF() -ENDIF() + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9ab..e3c199b7c5de 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake index cf763b7e5bb5..77ce8c71f730 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ 
b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8e3..85ae0b82244c 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454ad..ce428b0aeec6 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. 
-SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 603510c315e4..68de942a6983 100644 --- a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -1,9 +1,10 @@ -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main() @@ -13,37 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE 
+if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() + endif() # Export oneDPL as a Kokkos dependency - KOKKOS_EXPORT_CMAKE_TPL(oneDPL) -ENDIF() + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b29..9673af0b9d90 100644 --- a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) 
+find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. # We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake index dae7dc3c9520..b4b905795dd0 100644 --- a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -6,10 +6,10 @@ # behavior of ROCm 5.7 and later for earlier version of ROCm we set # AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If # the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. 
-SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") -SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") -FIND_PACKAGE(rocthrust REQUIRED) -KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) +set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(rocthrust) +kokkos_export_cmake_tpl(rocthrust) diff --git a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f8e..280b8641da15 100644 --- a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/packages/kokkos/cmake/README.md b/packages/kokkos/cmake/README.md index 385bbfcd5d5a..0548e89a90e7 100644 --- a/packages/kokkos/cmake/README.md +++ b/packages/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. 
Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. -### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. 
- ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) diff --git a/packages/kokkos/cmake/build_env_info.cmake b/packages/kokkos/cmake/build_env_info.cmake index 0eeb6372455b..ac28b2d8503a 100644 --- a/packages/kokkos/cmake/build_env_info.cmake +++ b/packages/kokkos/cmake/build_env_info.cmake @@ -2,111 +2,108 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) 
- FILE( - COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() - - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() + +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() + + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) 
+ OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) @@ -114,9 +111,9 @@ FUNCTION(check_git_setup) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. 
-IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/packages/kokkos/cmake/compile_tests/amd_apu.cc similarity index 57% rename from packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp rename to packages/kokkos/cmake/compile_tests/amd_apu.cc index 3c599b95a6f3..a9c1edbd57b0 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ b/packages/kokkos/cmake/compile_tests/amd_apu.cc @@ -14,5 +14,25 @@ // //@HEADER -#include -#include +#include +#include + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/packages/kokkos/cmake/cray.cmake b/packages/kokkos/cmake/cray.cmake index 08912f5130f9..4ce5352bda26 100644 --- a/packages/kokkos/cmake/cray.cmake +++ b/packages/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/packages/kokkos/cmake/deps/CUDA.cmake b/packages/kokkos/cmake/deps/CUDA.cmake index 5b6afd61512d..49eaf883a46f 100644 --- a/packages/kokkos/cmake/deps/CUDA.cmake +++ b/packages/kokkos/cmake/deps/CUDA.cmake @@ -17,24 
+17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) +set(_CUDA_FAILURE OFF) # Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/packages/kokkos/cmake/deps/HWLOC.cmake b/packages/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a64..52d8368d0419 100644 --- a/packages/kokkos/cmake/deps/HWLOC.cmake +++ b/packages/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/deps/Pthread.cmake b/packages/kokkos/cmake/deps/Pthread.cmake index e879bff3741d..b811f850841d 100644 --- a/packages/kokkos/cmake/deps/Pthread.cmake +++ b/packages/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/deps/quadmath.cmake b/packages/kokkos/cmake/deps/quadmath.cmake index 6aef08e8812f..9006d0cb9efb 100644 --- a/packages/kokkos/cmake/deps/quadmath.cmake +++ b/packages/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake index a18d2ac518a6..d3fe1e6e2f62 100644 --- a/packages/kokkos/cmake/fake_tribits.cmake +++ b/packages/kokkos/cmake/fake_tribits.cmake @@ -1,288 +1,213 @@ #These are tribits wrappers used by all projects in the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) 
-FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR ${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. 
- if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() + endforeach() +endfunction() + +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() + +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() + +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() + +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() + +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" + ${ARGN} + ) + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) - ELSE() - ADD_TEST(NAME 
${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() - -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() + set(EXE_ROOT ${TEST_NAME}) + endif() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + + # For compatibility with Trilinos testing, we support: + # * `-D _DISABLE=ON` + # * `-D _EXTRA_ARGS=";;;..."` + # * `-D _SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) - - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES 
${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) endif() -ENDFUNCTION() - -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() - -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - 
"INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() + # Trilinos testing benefits from labeling the tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't actually currently support `-D _ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} 
LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() +function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) + else() + set(${RET} "PUBLIC" PARENT_SCOPE) + endif() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") 
+ endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" - ${ARGN} - ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) endif() -endforeach() -ENDMACRO() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS 
"${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() -FUNCTION(GLOBAL_APPEND VARNAME) +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/packages/kokkos/cmake/gnu.cmake b/packages/kokkos/cmake/gnu.cmake index aa11fe87b111..e53b4a7becdd 100644 --- a/packages/kokkos/cmake/gnu.cmake +++ b/packages/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/intel.cmake b/packages/kokkos/cmake/intel.cmake index 7e6ee3358c90..b7752caabdf8 100644 --- a/packages/kokkos/cmake/intel.cmake +++ b/packages/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index a581d9f94571..ae45da806f73 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -1,611 +1,732 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + 
list(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) +set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) -KOKKOS_DEPRECATED_LIST(ARCH ARCH) - -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() - -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") -DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") - -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 
3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +kokkos_deprecated_list(ARCH ARCH) + +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() + +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") +declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") + +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() + +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler 
generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") + +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 
AMD_GFX906) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS RX7900XTX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS NAVI1100 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1100 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() - -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") - -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() + +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() + +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") + +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. 
- IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() - - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() - - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() + + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() + + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() + +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() - + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
+ endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- -KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") -KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() - - -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() - - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +global_set(KOKKOS_AMDGPU_OPTIONS) 
+if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() + +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") + endif() + + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() + +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +endif() + +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache 
variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +endif() + +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() - -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve - ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV9_GRACE) - SET(KOKKOS_ARCH_ARM_NEON ON) +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) 
check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) - IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 - ) - ELSE() - MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") - ENDIF() -ENDIF() - -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() + +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 + set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + 
COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) -ENDIF() - -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 +endif() + +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) -ENDIF() - -IF (KOKKOS_ARCH_RISCV_SG2042) - IF(NOT - (KOKKOS_CXX_COMPILER_ID STREQUAL GNU - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) - OR - (KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) +endif() + +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) - MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=rv64imafdcv - ) -ENDIF() - - -IF 
(KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() + +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() - -IF (KOKKOS_ARCH_KNL) - #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause + ) +endif() + +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic +if(KOKKOS_ARCH_KNL) + #avx512-mic + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() - -IF (KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +endif() + +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() + +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() - -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +endif() + +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() - -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +endif() + +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() - -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +endif() + +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() - -IF (KOKKOS_ARCH_SPR) - 
SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +endif() + +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() - -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +endif() + +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +endif() + +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +endif() + +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. 
-IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. 
- IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() # FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
-IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - UNSET(KOKKOS_ARCH_ARM_NEON) -ENDIF() - -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - ) - ENDIF() -ENDIF() +if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() + +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT --hip-link - ) - ENDIF() - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_HIP) + 
if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() + compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT -fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables # FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device @@ -613,17 +734,18 @@ ENDIF() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. 
-IF(KOKKOS_ENABLE_SYCL) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) +if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSEIF(NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " #include using namespace sycl::ext::oneapi::experimental; using namespace sycl; @@ -638,548 +760,617 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. 
- COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." ) - ENDIF() - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() - + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." 
+ ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) - -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) +check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) + +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() - -IF(KOKKOS_IMPL_AMDGPU_FLAGS) - IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. 
" - "Please explicitly set the GPU architecture.") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") -ENDIF() - -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() - -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() - -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp - ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() + +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." 
+ ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() + +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() + +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() + +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() + +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - 
IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() + +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." 
) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc -gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + 
if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} - ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." 
) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" 
) - ENDIF() - ENDIF() -ENDIF() - -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) + + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + + try_run(_RESULT _COMPILE_RESULT 
${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture..." 
+ ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() - - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. 
" - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() + + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." 
+ ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." 
+ "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. " + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. 
" - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() - -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." 
+ ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() + +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI 
${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") - -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") - -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() - -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() - -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() - -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() - -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() - - -IF(KOKKOS_ENABLE_ATOMICS_BYPASS) - IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") - MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") - ENDIF() - IF(NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "Implementation bug") # safeguard - ENDIF() - MESSAGE(STATUS "Atomics: **DISABLED**") -ENDIF() +message(STATUS "Built-in Execution Spaces:") + +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." 
+ ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") + +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() + +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." 
+ ) +endif() + +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() + +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() + +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() + +if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/packages/kokkos/cmake/kokkos_check_env.cmake b/packages/kokkos/cmake/kokkos_check_env.cmake index a455a403b9d5..f1a309ff8579 100644 --- a/packages/kokkos/cmake/kokkos_check_env.cmake +++ b/packages/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. 
Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index e8bfadb64ebe..010ed33ede89 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,262 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). 
- EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() -IF(Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS 
${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" + ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. 
- SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" 
INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - 
OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler 
version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. -IF(NOT CMAKE_CXX_STANDARD) - SET(CMAKE_CXX_STANDARD 17) -ENDIF() -IF(CMAKE_CXX_STANDARD EQUAL 17) - SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 8.2.0) - SET(KOKKOS_INTEL_MINIMUM 19.0.5) - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 11.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.29) -ELSE() - SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 10.1.0) - SET(KOKKOS_INTEL_MINIMUM "not supported") - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 12.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.30) -ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + 
set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: 
${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID 
STREQUAL Intel) - IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) 
+ message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. 
- SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ENDIF() + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() -STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) +string(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) # On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() - +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/packages/kokkos/cmake/kokkos_configure_trilinos.cmake b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 000000000000..5aeef61e7b32 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + 
set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0caf8..530e9e8fd8e0 100644 --- a/packages/kokkos/cmake/kokkos_corner_cases.cmake +++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. 
See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake index c7d189285c58..40c2d3ea8afb 100644 --- a/packages/kokkos/cmake/kokkos_enable_devices.cmake +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,128 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) - +kokkos_deprecated_list(DEVICES ENABLE) -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND 
"x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") # this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") # Device backends have to come after host backends for header include order reasons # Without this we can't make e.g. CudaSpace accessible by HostSpace -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -ENDIF() - -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() - - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + 
IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp ) -# Are there compilers which identify as Clang and need this library? -# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? + # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") + +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() + +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() +if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup 
requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index 53764b0c6848..a5d6fdfe4edd 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -1,198 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. -# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). 
########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") 
+kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") - -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. 
-KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") - -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS 
OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") -KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) + +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. +kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + 
kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. 
-KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") -KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") -KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) -KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." 
+) mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") - -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") - -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") - -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") - -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. 
Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") + +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." 
+ ) +endif() + +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) + +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") + +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." 
+ ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() + +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. 
When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() - -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. 
Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() + +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() + +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() - - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + endif() +endif() + +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. 
Desul atomics cannot be disabled.") +endif() diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake index d1f1e0d7a785..38eedd8362c5 100644 --- a/packages/kokkos/cmake/kokkos_functions.cmake +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. # Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. 
For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." 
+ ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() + +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. 
Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL 
"${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." 
+ ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() - -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() - -MACRO(kokkos_export_cmake_tpl NAME) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() + +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() + +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 
+142,88 @@ MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() - -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure 
this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") - - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() - - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() - - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() - - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() - + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + 
string(APPEND KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() + +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") + + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() + + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() + + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() + + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + 
kokkos_append_config_line("ENDIF()") + endif() +endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - - IF (KOKKOS_ENABLE_${NAME}) +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() - -MACRO(kokkos_import_cmake_tpl MODULE_NAME) + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) + 
+macro(kokkos_import_cmake_tpl MODULE_NAME) kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) - - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) + + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE 
${TPL_COMPILE_DEFINITIONS}) - ENDIF() - IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # 
Suffixes appended to PATHS when attempting to locate # the library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) - - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() - - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) + + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() + + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,111 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, _ROOT/include and _ROOT/include will be searched. 
# -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) - - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() - - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() - - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() - - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib) - IF(KOKKOS_IMPL_32BIT) - LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) - ELSE() - LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) - ENDIF() - ENDIF() - - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() - - FOREACH(HEADER ${TPL_HEADERS}) - KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() - - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) + + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() + + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() + + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() + + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + 
if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + endif() + endif() + + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() + + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() + + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() - - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() + + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() - - INCLUDE(FindPackageHandleStandardArgs) + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() + + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND 
TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) - - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() - - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() + + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -817,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries( ) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - 
"PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() - -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() - - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() - - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() - - IF (PARSE_COMPILE_OPTIONS) + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + 
elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() + +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() + + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() + + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() + + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() - - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF 
(PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) - -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) - -FUNCTION(COMPILER_SPECIFIC_LIBS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() + + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) + +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) + +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) + +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) + +function(COMPILER_SPECIFIC_DEFS) + 
compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) + +function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -927,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() - -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. 
${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() + +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() - - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() + + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include #include @@ -985,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ -1005,58 +966,77 @@ 
ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. # -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() - - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
- IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() + + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() + + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'" + ) + endif() + + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1066,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. 
fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_install.cmake b/packages/kokkos/cmake/kokkos_install.cmake index f818dfa24485..3ae7570ffea5 100644 --- a/packages/kokkos/cmake/kokkos_install.cmake +++ b/packages/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + 
get_property(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake - 
${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") - - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) +endif() +install(FILES 
${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake index ae14a10d531f..0d31e6d131f7 100644 --- a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,20 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -SET(KOKKOS_ENABLE_CXX26 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS "Setting default Kokkos CXX standard to 
${KOKKOS_CXX_STANDARD}") diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake index 5b45674e0570..a84e714064df 100644 --- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,101 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. 
Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by 
NVCC. Using a more recent host compiler or a more recent CMake version might help." + ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") kokkos_set_cxx_standard_feature(26) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") - SET(KOKKOS_ENABLE_CXX26 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") 
-ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -105,66 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - 
CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. 
You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake index cda9e0d6004a..f43aff4d1f08 100644 --- a/packages/kokkos/cmake/kokkos_tpls.cmake +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -1,126 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF (KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT 
PARSED_TRIBITS) - #this TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() - -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) - SET(ROCTHRUST_DEFAULT ON) -ELSE() - SET(ROCTHRUST_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) +endif() -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) 
-SET(HPX_DEFAULT ON) -ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -KOKKOS_IMPORT_TPL(CUDA INTERFACE) -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBDL) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -KOKKOS_IMPORT_TPL(LIBQUADMATH) -KOKKOS_IMPORT_TPL(ROCTHRUST) +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) -ENDIF() + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() -if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) 
find_package(mdspan REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. - IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) - ENDIF() - IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) - ENDIF() - IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) - ENDIF() -ENDIF() +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake index 6da543a2c85b..2fda803b1181 100644 --- a/packages/kokkos/cmake/kokkos_tribits.cmake +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) 
-INCLUDE(CTest) -INCLUDE(GNUInstallDirs) +include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() - -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() 
-ENDMACRO() - -MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $) - - INSTALL( +endfunction() + +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() + +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $) + + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT ${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,157 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() + +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + endif() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + 
target_link_libraries(${EXE_NAME} PRIVATE Kokkos::kokkos) +endfunction() + +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) - - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. 
+ if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() + +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") endif() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST 
${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() - # We noticed problems with -fvisibility=hidden for inline static variables - # if Kokkos was built as shared library. - IF(BUILD_SHARED_LIBS) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) - ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() + +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build 
examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL ${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 
"${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test + if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. @@ -242,265 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + 
KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. -MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() - - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) - - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() + + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) + + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + 
"${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) -ENDMACRO() - + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) +function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + 
target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> ) - SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() - - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + endif() + + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC 
$<$:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() + +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + 
add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. 
impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() + add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() + +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know 
about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) + endif() +endfunction() + +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) + add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() endif() -ENDMACRO() - -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +endmacro() diff --git a/packages/kokkos/cmake/msvc.cmake b/packages/kokkos/cmake/msvc.cmake index 85421bdbaaa4..1de13585c730 100644 --- a/packages/kokkos/cmake/msvc.cmake +++ b/packages/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - 
SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/pgi.cmake b/packages/kokkos/cmake/pgi.cmake index e98e84955888..45f59dcd10bf 100644 --- a/packages/kokkos/cmake/pgi.cmake +++ b/packages/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d2253489..52d8368d0419 100644 --- a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d4d..f51bce5d64d7 100644 --- a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1b5..b449f45135aa 100644 --- a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt index 0857d7007b44..8ee8bb41a28a 100644 --- a/packages/kokkos/containers/CMakeLists.txt +++ b/packages/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT 
KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85dc..8d4d605b0871 100644 --- a/packages/kokkos/containers/performance_tests/CMakeLists.txt +++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f52..953b8bff6e59 100644 --- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template void test_scatter_view(int m, int n) { - Kokkos::View original_view("original_view", - n); + Kokkos::View 
original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + Kokkos::View hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt index b7d85ebf11d7..b386fbe67505 100644 --- a/packages/kokkos/containers/src/CMakeLists.txt +++ b/packages/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES 
${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7e9..409260f0218d 100644 --- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? 
Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp index a37a2bdcebd9..6a2e6f73a15e 100644 --- a/packages/kokkos/containers/src/Kokkos_DualView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! 
Copy constructor (shallow copy) @@ -338,23 +353,21 @@ class DualView : public ViewTraits { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; // does the given device match the device of t_dev? template struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same::value - }; + enum : bool { value = std::is_same_v }; }; // does the given device match the device of t_host? template struct impl_device_matches_thost_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -362,7 +375,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -370,7 +383,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -378,8 +391,8 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -389,11 +402,6 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. 
- /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -410,56 +418,47 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same::value; - constexpr bool device_is_execspace = - std::is_same::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same::value; - constexpr bool device_is_t_host_device = - std::is_same::value; - constexpr bool device_is_t_dev_device = - std::is_same::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } 
else { + static_assert(std::is_same_v, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v) + return d_view; + else { + static_assert(std::is_same_v, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -475,27 +474,27 @@ class DualView : public ViewTraits { template static int get_device_side() { constexpr bool device_is_memspace = - std::is_same::value; + std::is_same_v; constexpr bool device_is_execspace = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_host_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_host_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_host_device = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_dev_device = - std::is_same::value; + std::is_same_v; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -627,9 +626,9 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}); } @@ -637,9 +636,9 @@ class DualView : public ViewTraits { template void 
sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}, exec); } @@ -669,18 +668,18 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}); } template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}, exec); } @@ -943,12 +942,21 @@ class DualView : public ViewTraits { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if constexpr (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { @@ -1062,9 +1070,22 @@ class DualView : public ViewTraits { } }; - constexpr bool has_execution_space = 
alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property(arg_prop); @@ -1182,15 +1203,15 @@ class DualView : public ViewTraits { } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> extent(const iType& r) const { return d_view.extent(r); } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> extent_int(const iType& r) const { return static_cast(d_view.extent(r)); } diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index 5f7fcaf69e7f..2f2f4433e7ca 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank::type*; +}; + +template +struct ViewDataTypeFromRank { + using type = T; +}; + +template +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( + DynRankView v, + std::enable_if_t::specialize, + void>>* = nullptr); + template struct 
DynRankDimTraits { enum : size_t { unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value), Layout> + (std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? 
layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value) && - std::is_integral::value, + (std::is_same_v || + std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? 
layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value) && - std::is_integral::value, + (std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && - std::is_void::value && - (std::is_same::value || - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v && + std::is_void_v && + std::is_void_v && + (std::is_same_v || + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { is_assignable_layout = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view> : public 
std::true_type { template inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template -class DynRankView : public ViewTraits { - static_assert(!std::is_array::value && - !std::is_pointer::value, +class DynRankView : private View { + static_assert(!std::is_array_v && !std::is_pointer_v, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits { template friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits; using view_type = View; - using traits = ViewTraits; - private: - using map_type = - Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + using reference_type = typename view_type::reference_type; + using 
pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function makes NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,113 +498,32 @@ class DynRankView : public ViewTraits { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum?
- template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same::value, + std::is_same_v, is_layout_right = - std::is_same::value, + std::is_same_v, - is_layout_stride = std::is_same::value, + is_layout_stride = + std::is_same_v, - is_default_map = std::is_void::value && + is_default_map = std::is_void_v && (is_layout_left || is_layout_right || is_layout_stride) }; @@ -570,476 +551,150 @@ class DynRankView : public ViewTraits { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) 
- template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; - } - - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... AND a Trilinos/Sacado scalar type ) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); - } - - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value 
&& - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - 
std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& 
i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, 
m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, 
i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); + } + +// This is an accommodation for Phalanx, that is using the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v) { + return view_type::data()[i0]; + }
else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... 
KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + // Make this conditionally explicit? + template + KOKKOS_FUNCTION DynRankView(const DynRankView& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; + template + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; +#if 0 // TODO: this will later be swapped in depending on whether the new View + // impl is active + private: + template + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); } - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. 
+ public: + // Copy/Assign View to DynRankView template - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); + KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView& rhs) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); m_rank = rhs.rank(); return *this; } - - // Copy/Assign View to DynRankView +#else template - KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(View::rank()) { + KOKKOS_FUNCTION DynRankView(const View& rhs, size_t new_rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); 
Mapping::assign(*this, rhs); + m_rank = new_rank; } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View& rhs) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View::rank(); return *this; } +#endif + + template + KOKKOS_FUNCTION DynRankView(const View& rhs) + : DynRankView(rhs, View::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout), - Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} - // Wrappers template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, 
- const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +809,20 @@ class DynRankView : public ViewTraits { //---------------------------------------- // Memory span required to wrap these dimensions. 
+ // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +830,38 @@ class DynRankView : public ViewTraits { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + typename traits::array_layout& arg_layout) + : 
DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(arg_layout) // is this correct? 
- )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +872,38 @@ class DynRankView : public ViewTraits { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - 
const DynRankView& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +918,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template -class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - template - struct apply { - static_assert(Kokkos::is_memory_traits::value); - - using traits_type = - Kokkos::ViewTraits; - - using type = Kokkos::View; - }; - - using dimension = typename SrcTraits::dimension; - - template - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView; - - template - KOKKOS_INLINE_FUNCTION static ret_type subview( - const unsigned src_rank, 
Kokkos::DynRankView const& src, - Args... args) { - using DstType = ViewMapping; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - 
extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template using Subdynrankview = typename Kokkos::Impl::ViewMapping::ret_type; -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subdynrankview(const Kokkos::DynRankView& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } - - using metafcn = - Kokkos::Impl::ViewMapping, Args...>; - - return metafcn::subview(src.rank(), src, args...); +template +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 4 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v ? 1 : 0); + + using return_type = + DynRankView; + return static_cast( + DynRankView( + sub, new_rank)); } - -// Wrapper to allow subview function name -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subview(const Kokkos::DynRankView& src, Args... 
args) { - return subdynrankview(src, args...); +template +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +972,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1128,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( DynRankView v, - typename std::enable_if::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,7 +1143,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - auto layout = v.impl_map().layout(); + auto layout = v.DownCast().layout(); if constexpr (std::is_same_v || std::is_same_v || @@ -1691,43 +1181,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView a) { } // namespace Impl -template -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - 
case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template inline void deep_copy( const ExecSpace& e, const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1738,8 +1201,8 @@ template inline void deep_copy( const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1750,8 +1213,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1759,8 +1222,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const DynRankView& 
src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1773,15 +1236,13 @@ inline void deep_copy( template inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1826,15 +1287,13 @@ inline void deep_copy( template inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1894,7 +1353,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1909,26 +1368,6 @@ struct MirrorDRViewType { std::conditional_t; }; -template -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is 
the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView; -}; - } // namespace Impl namespace Impl { @@ -1945,10 +1384,9 @@ inline auto create_mirror(const DynRankView& src, arg_prop, std::string(src.label()).append("_mirror")); if constexpr (Impl::ViewCtorProp::has_memory_space) { - using dst_type = typename Impl::MirrorDRVType< + using dst_type = typename Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - + P...>::dest_view_type; return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } else { @@ -1989,7 +1427,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(const Space&, const Kokkos::DynRankView& src) { +inline auto create_mirror(const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } @@ -1999,8 +1438,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src) { +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -2026,12 +1465,12 @@ inline auto create_mirror_view( [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename DynRankView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView< - T, 
P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::choose_create_mirror(src, arg_prop); @@ -2102,7 +1541,7 @@ inline auto create_mirror_view( // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp index a4b74e246e0d..caae3f791f04 100644 --- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, 
value_type** arg_chunk, @@ -250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void::value, + static_assert(std::is_void_v, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference::value + std::is_lvalue_reference_v }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -665,9 +665,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorDynamicViewType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -693,14 +693,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::DynamicView& 
src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { @@ -835,21 +835,17 @@ inline void deep_copy(const View& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template @@ -861,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -964,7 +956,7 @@ struct ViewCopy, // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 3adc70b19049..cf23c25b86bd 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template ::value && - std::is_signed::value, - iType> = 0> +template && std::is_signed_v, + iType> = 0> using IndexRange = Kokkos::Array; using index_list_type = std::initializer_list; @@ -118,11 +118,11 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE( (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) @@ -180,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template -class OffsetView : public ViewTraits { - public: - using traits = ViewTraits; - +class OffsetView : public View { private: template friend class OffsetView; - template - friend class View; // FIXME delete this line - template - friend class Kokkos::Impl::ViewMapping; - using 
map_type = Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array; template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast(local_dimension) < base_t::rank() + ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -245,529 +241,60 @@ class OffsetView : public ViewTraits { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION 
constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); + template + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence, OtherIndexTypes... indices) const { + return base_t::operator()((indices - m_begins[I])...); } - template - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping& implementation_map() const { - return m_map; - } - - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - 
KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - 
KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - if constexpr (is_layout_left) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - else - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } else if constexpr (is_layout_right) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - else - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined(KOKKOS_COMPILER_INTEL) - __builtin_unreachable(); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const 
OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)); #endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, 
const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = 
i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, 
m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; + template +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && ...) 
&& + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence(), + indices...); } - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } + template + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... args) const = delete; -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); 
- m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -778,20 +305,10 @@ class OffsetView : public ViewTraits { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template - KOKKOS_FUNCTION OffsetView(const View& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View& aview) : base_t(aview) { for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } @@ -800,19 +317,14 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -820,27 +332,13 @@ class OffsetView : public 
ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template KOKKOS_FUNCTION OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -879,7 +377,7 @@ class OffsetView : public ViewTraits { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -887,19 +385,19 @@ class OffsetView : public ViewTraits { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -941,7 +439,7 @@ class OffsetView : public ViewTraits { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -951,11 +449,11 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION 
static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -993,20 +491,25 @@ class OffsetView : public ViewTraits { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? 
at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1040,15 +543,6 @@ class OffsetView : public ViewTraits { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1070,18 +564,34 @@ class OffsetView : public ViewTraits { const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit OffsetView( @@ -1094,18 +604,34 @@ class OffsetView : public ViewTraits { const std::pair range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit KOKKOS_FUNCTION OffsetView( @@ -1113,9 +639,14 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1132,42 +663,9 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1177,7 +675,7 @@ class OffsetView : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1185,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { namespace Impl { template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1197,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, Kokkos::pair> + std::enable_if_t, Kokkos::pair> shift_input(const Kokkos::pair arg, const int64_t offset) { return Kokkos::make_pair(arg.first - offset, arg.second - offset); } template -inline std::enable_if_t::value, std::pair> -shift_input(const std::pair arg, const int64_t offset) { +inline std::enable_if_t, std::pair> shift_input( + const std::pair arg, const int64_t offset) { return std::make_pair(arg.first - 
offset, arg.second - offset); } @@ -1212,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array& subviewBegins, std::enable_if_t shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral::value) { + if (!std::is_integral_v) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1621,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits, Args...>::type>::type subview(const OffsetView& src, Args... args) { static_assert( - OffsetView::Rank == sizeof...(Args), + OffsetView::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1641,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1672,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1704,11 +1202,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename 
ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1719,11 +1217,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1733,11 +1231,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const View& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1748,11 +1246,11 @@ template inline void deep_copy( const View& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1770,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1786,27 +1284,6 @@ struct MirrorOffsetViewType { std::conditional_t; }; -template 
-struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView; -}; - } // namespace Impl namespace Impl { @@ -1825,10 +1302,12 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); } else { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); @@ -1877,9 +1356,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } @@ -1905,14 +1384,14 
@@ inline auto create_mirror_view( const Kokkos::Experimental::OffsetView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd0e..52af567c61d2 100644 --- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution -struct DefaultDuplication { +struct DefaultDuplication { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice; + static auto get(V const& src, const size_t i, Args... 
args) { return next::get(src, i, Kokkos::ALL, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... 
args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess::array_layout, typename ViewTraits::device_type, Op, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, typename std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>>::type, diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac217..ec1b8905c766 100644 --- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp 
+++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const std::enable_if_t::value, int>& = 0) + const std::enable_if_t, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp index c3a8b67df8df..4f47051a5c1c 100644 --- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void::value, // !is_set + !std::is_void_v, // !is_set std::conditional_t> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -808,8 +808,8 @@ class UnorderedMap { // Re-allocate the views of the calling UnorderedMap according to src // capacity, and deep copy the src data. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> create_copy_view( UnorderedMap const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { @@ -821,8 +821,8 @@ class UnorderedMap { // Allocate views of the calling UnorderedMap with the same capacity as the // src. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> allocate_view( UnorderedMap const &src) { insertable_map_type tmp; @@ -852,8 +852,8 @@ class UnorderedMap { // Deep copy view data from src. This requires that the src capacity is // identical to the capacity of the calling UnorderedMap. 
template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> deep_copy_view( UnorderedMap const &src) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba56..83ccfbf6305f 100644 --- a/packages/kokkos/containers/src/Kokkos_Vector.hpp +++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a85..6255a86c4614 100644 --- a/packages/kokkos/containers/unit_tests/CMakeLists.txt +++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) 
file(MAKE_DIRECTORY ${dir}) - foreach(Name - Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include \n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include \n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. 
extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp index 9923453f72ce..91dc1710e5f8 100644 --- a/packages/kokkos/containers/unit_tests/TestBitset.hpp +++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp @@ -39,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp index 2512cb5c4915..5d03e6202a89 100644 --- a/packages/kokkos/containers/unit_tests/TestDualView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView; using DstViewType = - Kokkos::DualView; + Kokkos::DualView; SrcViewType a("A", n, m); @@ -520,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. 
*/ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif - -#ifdef UVM_ENABLED_BUILD -template -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor { - using type = Kokkos::HIPManagedSpace; -}; -#endif - -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor { - using type = typename UVMSpaceFor::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template +using TestSharedSpace = Kokkos::SharedSpace; #else -template -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using MemSpace = typename UVMSpaceFor::type; +using MemSpace = TestSharedSpace; using DeviceType = Kokkos::Device; using DualViewType = Kokkos::DualView; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor::type>; +using ConstDualViewType = + Kokkos::DualView; +using d_device = DeviceType; +using h_device = + Kokkos::Device>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -635,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view()); - using dvt = decltype(dv.view()); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), 
dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view(), + const_dv.view()); + ASSERT_EQ(dv.view(), + const_dv.view()); +} + +// User-defined types with a View data member, only host-constructible +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template +auto initialize_view_of_views() { + Kokkos::DualView dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views>>(); + + Kokkos::DualView dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 000000000000..95117a22e6e8 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// clang-format off +template +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type*; + using const_data_type = typename data_analysis::const_data_type*; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type*; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type[N]; + using const_data_type = typename data_analysis::const_data_type[N]; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type[N]; +}; + +template +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. 
where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + static_assert(std::is_same_v); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? 
+ static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device; + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v); + // FIXME: Not supported yet + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + + static_assert(std::is_same_v); + // FIXME: should be remove_const_t + static_assert(std::is_same_v); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template +struct ViewParams {}; + +template +constexpr bool test_view_typedefs(ViewParams) { + return test_view_typedefs_impl, Kokkos::ViewTraits, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = 
false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View ...) + using host_mirror_space = std::conditional_t>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams>{})); +} + +// Kokkos::View> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + 
static_assert(test_view_typedefs>>>( + ViewParams>{})); +} +// clang-format on +} // namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 000000000000..e5f8860de76c --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView; + using policy_type = Kokkos::TeamPolicy; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = 
i * 100 + j * 10 + k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25cc5..930c76c32c47 100644 --- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 
1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8b3..94ccea86eb9b 100644 --- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 
4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. 
first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. - // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. 
- // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468c5..4ebab889c78f 100644 --- a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter>(); } diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp index c133922e3def..706b40fff386 100644 --- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", 
ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ 
-261,7 +265,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + (void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, 
{-0x7fffffffffffffffl - 1}, {0}), - std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + 
"Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template @@ -377,8 +465,8 @@ void test_offsetview_subview() { Kokkos::Experimental::OffsetView sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - 
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview 
of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); 
- ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T 
init, BinaryOperation op) { @@ -586,6 +671,7 @@ void test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122ce9..72c1afbe96a7 100644 --- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct 
test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -335,11 +335,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same::value) { + if (!std::is_same_v) { #endif test_scatter_view_config::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); } } /* namespace TestStaticCrsGraph */ diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4a7e826ecbe4..fc7435a75e56 100644 --- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -460,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template - void insert(Args &&... 
args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array values{ @@ -534,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -549,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddfe7..2edddcce34f4 100644 --- a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< 
CommonViewValueType, double >::value == true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index e8558628dc84..2932898554c5 100644 --- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -44,6 +44,12 @@ Kokkos::CudaSpace>) \ GTEST_SKIP() << "skipping since unified memory requires additional " \ "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; #else #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE #endif @@ -51,8 +57,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 5, 6, 7, - 8); + Kokkos::DualView bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -82,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), 
Config::EnableKernels(), Config::EnableAllocs()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -112,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -245,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -280,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = validate_absence( @@ -312,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -343,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> 
bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -384,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt index 0917928001a9..21f05f627242 100644 --- a/packages/kokkos/core/CMakeLists.txt +++ b/packages/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt index e0dba03e1ecb..0cb2c804d383 100644 --- a/packages/kokkos/core/perf_test/CMakeLists.txt +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -1,50 +1,36 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
# FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() - - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() - -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() + + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() + +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found 
in ${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -57,143 +43,93 @@ ELSE() list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE 
${benchmark_SOURCE_DIR}/include) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - PerfTest_ViewResize_Raw.cpp -) + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) + + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} 
${BENCHMARK_ARGS}) +endfunction() -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL 
NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71e1..1ebe750f2164 100644 --- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View; + Kokkos::View; using elem_grad_type = - Kokkos::View; + Kokkos::View; elem_coord_type coords; elem_grad_type grad_op; diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a916f..03340a5d6de4 100644 --- a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include #include -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template std::pair custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction) ->UseManualTime(); } // namespace Test -#endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823a2..aa23ddbb6072 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping(const Kokkos::SYCL&) 
{ return true; } #endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e55545..e4db40e128c3 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d9154f..57bba83a9c1e 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3cab5..ab469cb647ca 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp index 9905740afb4d..bdfe59b0b3bc 100644 --- a/packages/kokkos/core/perf_test/test_mempool.cpp +++ 
b/packages/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/packages/kokkos/core/perf_test/test_sharedSpace.cpp b/packages/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409ad..3c06770e2861 100644 --- a/packages/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/packages/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info(); #else #error \ diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp index fccaab64ddf1..347d9748b5a9 100644 --- a/packages/kokkos/core/perf_test/test_taskdag.cpp +++ b/packages/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file 
+KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt index b84677e61b6f..72663739a142 100644 --- a/packages/kokkos/core/src/CMakeLists.txt +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -1,118 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) - ENDIF() - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - IF(KOKKOS_ENABLE_OPENACC) - SET(DESUL_ATOMICS_ENABLE_OPENACC ON) - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp - ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + 
set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) - -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - 
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() - -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) - - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - "${CMAKE_CURRENT_BINARY_DIR}/desul" +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) + +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() + +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() + +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() + +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() + +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() + +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() + +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) + + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -120,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() - + 
message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found at:") + message(STATUS " " ${desul_DIR}) +endif() -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES ${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES ${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... 
we just flag on their version # for now until we can get some compiler detection support @@ -154,62 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) 
-KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. -IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. 
-IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index fd86976d3ba6..07c35e6611f1 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include // CUDA_SAFE_CALL #include -#include #include #include #include diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 6ae24022c8fd..8bcd6525c962 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -201,7 +201,14 @@ void *impl_allocate_common(const int device_id, } } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); if (error_code == cudaSuccess) { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index e1d062d72d5a..1ccf38a4a158 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -73,9 +73,9 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& 
operator=(CudaSpace&& rhs) = default; + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -174,9 +174,9 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; @@ -266,9 +266,9 @@ class CudaHostPinnedSpace { CudaHostPinnedSpace(int device_id, cudaStream_t stream); public: - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 5a821ab64a3c..058b1f538d56 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -51,7 +51,8 @@ class GraphNodeKernelImpl m_graph_node_ptr = nullptr; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... 
- mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -61,25 +62,20 @@ class GraphNodeKernelImpl - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -90,13 +86,21 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); 
KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -128,7 +132,8 @@ struct get_graph_node_kernel_type // {{{1 template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = @@ -136,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 625d8c317a1c..8e800e756d2b 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl { using node_details_t = GraphNodeBackendSpecificDetails; - void _instantiate_graph() { + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl; @@ -74,11 +81,11 @@ struct GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -129,6 +136,8 @@ struct GraphImpl { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template @@ -158,13 +167,13 @@ struct GraphImpl { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -197,6 +206,9 @@ struct GraphImpl { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t 
cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 158c8acdda6b..ec5768a7f0f6 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include #include #include #include @@ -687,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -708,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b0dadb45f72b..2d00e735cb9d 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const int cuda_device, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 
0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -491,7 +491,10 @@ struct CudaParallelLaunchKernelInvoker< cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -714,7 +717,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -728,7 +731,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 630389840048..c50ff430345c 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -95,11 +95,39 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? 
maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -108,13 +136,12 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -125,15 +152,16 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch( 
*this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -143,14 +171,15 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -159,6 +188,7 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -166,7 +196,8 @@ class ParallelFor, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -174,6 +205,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, 
m_rp.space().impl_internal_space_instance()); } else { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 334834938a17..8251fcb248d3 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -48,7 +48,7 @@ class ParallelFor, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 71e775182106..a2955e3ab61d 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,9 +539,14 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -631,7 +636,7 @@ class ParallelReduce word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -895,11 +901,16 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 86d6d91bbee1..5090e84c38cc 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define 
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ { \ @@ -584,9 +587,9 @@ class TaskExec { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1224,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa7894..aec692c2c366 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + 
KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, TeamPolicy, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - 
reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; + KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum reducer(val); + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); 
value_type value{}; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ 
parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += 
blockDim.x) { closure(i, reducer.reference()); } + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( + + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum(result)); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - )) + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4cccf..9e0c5819f712 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int 
lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op { }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... 
args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index 517c592af724..0ac2d4052d2a 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,15 +23,12 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst) { +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp index aced2083ffb5..8de3a8758fa1 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -27,6 +27,8 @@ #include +#include + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -49,34 +51,44 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. 
+ if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } + } - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; } +#endif - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; - - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, we can get 40 WF's / CU, but only can + // sustain 32 see + // 
https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -146,10 +158,6 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY - os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; - os << "yes\n"; -#endif os << "\nRuntime Configuration:\n"; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e50e..90e5cf73559f 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? 
LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 5f0df72df179..584cc63d958c 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -44,22 +44,17 @@ class GraphNodeKernelImpl // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) 
{} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -73,18 +68,29 @@ class GraphNodeKernelImpl hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast( + HIPSpace().allocate(exec, alloc_label.c_str(), sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. 
+ [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -114,13 +120,14 @@ struct get_graph_node_kernel_type Kokkos::ParallelReduceTag>> {}; template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = static_cast(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index a0989fe67111..4f97214ca683 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -42,11 +42,11 @@ class GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,18 +69,28 @@ class GraphImpl { template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; }; inline GraphImpl::~GraphImpl() { @@ -123,6 +133,8 @@ inline void GraphImpl::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has @@ -145,16 +157,15 @@ inline void GraphImpl::add_predecessor( hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index e0b25c69399a..54e8c315e3f6 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -97,6 +98,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << "undefined\n"; #endif + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; 
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); @@ -177,8 +185,16 @@ void HIPInternal::initialize(hipStream_t stream) { // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -353,14 +369,8 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -int HIPInternal::m_hipDev = -1; -unsigned HIPInternal::m_multiProcCount = 0; -unsigned HIPInternal::m_maxWarpCount = 0; -std::array HIPInternal::m_maxBlock = {0, 0, 0}; -unsigned HIPInternal::m_maxWavesPerCU = 0; -int HIPInternal::m_shmemPerSM = 0; -int HIPInternal::m_maxShmemPerBlock = 0; -int HIPInternal::m_maxThreadsPerSM = 0; +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; hipDeviceProp_t HIPInternal::m_deviceProp; @@ -372,15 +382,7 @@ std::mutex HIPInternal::constantMemMutex; //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} - -Kokkos::HIP::size_type hip_internal_maximum_warp_count() { - return HIPInternal::singleton().m_maxWarpCount; -} - -std::array hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 19349e90bb16..d8043dc23d7a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -31,11 +31,12 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ -#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) +#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) || \ + defined(KOKKOS_ARCH_AMD_GFX1103) static constexpr int WarpSize = 32; static constexpr int WarpIndexMask = 0x001f; /* hexadecimal for 31 */ static constexpr int WarpIndexShift = 5; /* WarpSize == 1 << WarpShift*/ @@ -51,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -71,12 +70,6 @@ class HIPInternal { using size_type = ::Kokkos::HIP::size_type; static int m_hipDev; - static unsigned m_multiProcCount; - static unsigned m_maxWarpCount; - static std::array m_maxBlock; - static unsigned m_maxWavesPerCU; - static int m_shmemPerSM; - static int m_maxShmemPerBlock; static int m_maxThreadsPerSM; static hipDeviceProp_t m_deviceProp; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47fc..e243eb07e784 100644 --- 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include #include -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include #include #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. 
- hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker, HIP> { const Policy m_policy; public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { @@ -57,7 +57,7 @@ class ParallelFor, HIP> { inline void execute() const { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; if (Policy::rank == 2) { dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); dim3 const grid( diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 
9355c1c75fbe..3985dc60f06b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -53,8 +53,8 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index bf0c2193383f..83e890bce99a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -71,8 +71,8 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; __device__ inline void operator()() const { @@ -120,9 +120,14 @@ class ParallelFor, HIP> { m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -149,8 +154,9 @@ class ParallelFor, HIP> { static_cast(m_league_size)))); } - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 0c24e5cc62ad..fb4ff937cdff 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -46,6 +46,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t, + Kokkos::HIP::size_type>; using reducer_type = ReducerType; using size_type = HIP::size_type; @@ -72,7 +88,7 @@ class ParallelReduce const - word_count(reducer.value_size() / sizeof(size_type)); + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(word_size_type)); - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = reducer.init(reinterpret_cast( + kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. bool do_final_reduce = (m_league_size == 0); if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); + do_final_reduce = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? 
reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -227,7 +244,8 @@ class ParallelReduce(m_scratch_space), result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { @@ -249,8 +267,9 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -306,11 +325,15 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -356,7 +379,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " "L0 scratch memory")); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 83f829fddae3..0b679218092d 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,7 +23,7 @@ #include #include -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); #else diff --git 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 1ca7bd5cd0e6..a464609108cd 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,7 +20,7 @@ #include #include -#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); #else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb012132..feee44ccaf17 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type const global = - reinterpret_cast(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int 
step_size = blockDim.x * blockDim.y < warp_size diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 67635fc1c4ca..47f07b31abfe 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -51,28 +51,54 @@ static std::atomic is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } + void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { 
void* ptr = nullptr; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here @@ -80,6 +106,8 @@ void* HIPSpace::impl_allocate( Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -219,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index e1b4768b8771..2380772cacf8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,14 +58,14 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; 
HIPSpace& operator=(const HIPSpace& rhs) = default; ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -77,15 +77,10 @@ class HIPSpace { return allocate(arg_label, arg_alloc_size, arg_logical_size); } #else - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; #endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, @@ -98,10 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -114,6 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device + hipStream_t m_stream; }; template <> @@ -140,9 +136,9 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - 
HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; @@ -213,9 +209,9 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; @@ -280,7 +276,7 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; -#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) enum : bool{accessible = false}; #else enum : bool { accessible = true }; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a721f..1724b4361db8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 
+191,19 @@ class HIPTeamMember { #endif } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. 
in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += 
blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType 
val; - Kokkos::Sum reducer(val); - - reducer.init(reducer.reference()); - - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } - - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i 
= loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 67e1181125c2..f21c65f16dd8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -222,7 +222,8 @@ class TeamPolicyInternal m_tune_team_size(bool(team_size_request <= 0)), m_tune_vector_length(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on HIP execution " "space."); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b67..f5b1d321e8cf 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( reinterpret_cast(in), lane_or_delta, width); } template __device__ inline std::enable_if_t - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( *reinterpret_cast(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case 
template __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp new file mode 100644 index 000000000000..34d5ecf1a657 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +namespace Kokkos { +namespace Impl { + +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast(dst)[i] = 0; }); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 4bca29868f78..18708cf8c566 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,12 +23,21 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); + +template <> +struct ZeroMemset { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. 
+#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp index 245dc128ca86..7d4993379087 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -32,12 +32,10 @@ static_assert(false, #include #include #include -#include #include #include #include #include -#include #include #include @@ -75,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -140,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -196,7 +194,7 @@ class HPX { HPX(HPX &&other) = default; HPX(const HPX 
&other) = default; - HPX &operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -214,9 +212,9 @@ class HPX { struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -249,13 +247,15 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); @@ -281,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -447,6 +447,20 @@ class HPX { } }; +template +std::vector partition_space(HPX const &, Args... 
args) { + std::vector instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template +std::vector partition_space(HPX const &, std::vector const &weights) { + std::vector instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased( bool, bool, std::function &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1772,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. 
Executes lambda(iType i) for each @@ -1810,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type 
wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -1995,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) */ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515ae..d775b7fac3b7 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include +#include + #include #include @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 297b1fadee94..92dc506c5e9d 100644 --- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include #include +#include namespace Kokkos { @@ 
-60,13 +61,13 @@ namespace Impl { // NOTE the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T) { return false; } template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } @@ -75,7 +76,7 @@ is_less_than_value_initialized_variable(T arg) { template constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed::value != std::is_signed::value); + (std::is_signed_v != std::is_signed_v); auto const ret = static_cast(arg); if (static_cast(ret) != arg || (is_different_signedness && @@ -183,7 +184,7 @@ struct MDRangePolicy template friend struct MDRangePolicy; - static_assert(!std::is_void::value, + static_assert(!std::is_void_v, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -238,9 +239,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -257,9 +258,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const typename traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -291,14 +292,14 @@ struct MDRangePolicy } template ::value>> + typename = std::enable_if_t>> MDRangePolicy(Kokkos::Array const& lower, Kokkos::Array const& upper, Kokkos::Array const& tile = Kokkos::Array{}) : MDRangePolicy(typename traits::execution_space(), 
lower, upper, tile) {} template ::value>> + typename = std::enable_if_t>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array const& lower, Kokkos::Array const& upper, @@ -330,7 +331,44 @@ struct MDRangePolicy } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -341,6 +379,7 @@ struct MDRangePolicy rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; @@ -368,9 +407,7 @@ struct MDRangePolicy m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? 
std::max(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -389,58 +426,55 @@ struct MDRangePolicy }; template -MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy>; template MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], - const TT (&)[TN]) - ->MDRangePolicy>; + const TT (&)[TN]) -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template -MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; +MDRangePolicy(Array const&, Array const&) -> MDRangePolicy>; template MDRangePolicy(Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d66f..62f527aa025c 100644 --- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ 
b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp index 4d905fbc5538..493536b53bed 100644 --- a/packages/kokkos/core/src/Kokkos_Array.hpp +++ b/packages/kokkos/core/src/Kokkos_Array.hpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template ::value> +template > struct ArrayBoundsCheck; template @@ -195,8 +195,10 @@ struct Array { return *reinterpret_cast(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, Array const&) noexcept { @@ -365,7 +367,7 @@ struct KOKKOS_DEPRECATED #endif template -Array(T, Us...)->Array; +Array(T, Us...) 
-> Array; namespace Impl { @@ -377,7 +379,7 @@ KOKKOS_FUNCTION constexpr Array, N> to_array_impl( template KOKKOS_FUNCTION constexpr Array, N> to_array_impl( - T(&&a)[N], std::index_sequence) { + T (&&a)[N], std::index_sequence) { return {{std::move(a[I])...}}; } @@ -389,7 +391,7 @@ KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { } template -KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); } @@ -435,6 +437,32 @@ KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { } // namespace Kokkos // +// +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr T const* begin(Array const& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T* begin(Array& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T const* end(Array const& a) noexcept { + return a.data() + a.size(); +} + +template +KOKKOS_FUNCTION constexpr T* end(Array& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f27434..ba6113609229 100644 --- a/packages/kokkos/core/src/Kokkos_Atomic.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include #include -#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index bf57dcae650e..000000000000 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,196 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include -#include - -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, 
desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return 
desul::atomic_fetch_nand(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch 
(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { 
return desul::atomic_rshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void 
atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast(dest),compare, desired, - desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 26db69ac1f11..40f51c5a3340 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -24,14 +24,16 @@ static_assert(false, #include #include +#include // identity_type #include -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... -// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -53,197 +55,120 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { atomic_store(dest,val); } +namespace Impl { +template +using not_deduced_atomic_t = + std::add_const_t>>; + +template +using enable_if_atomic_t = + std::enable_if_t && !std::is_const_v, + std::remove_volatile_t>; +} // namespace Impl -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} +// clang-format 
off -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } +// load/store +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } +#endif // atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - 
-template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template 
KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t val) { return 
desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (dest, val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_add_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_sub_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_max_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_min_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul_fetch(T* ptr, 
Impl::not_deduced_atomic_t val) { return desul::atomic_mul_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_div_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mod_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_and_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_or_fetch (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_xor_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_lshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_rshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc_fetch(T* 
ptr) { return desul::atomic_inc_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec_fetch(T* ptr) { return desul::atomic_dec_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, 
desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t 
atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +// exchange +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif -// Exchange +// clang-format on +} // namespace Kokkos -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +namespace Kokkos::Impl { -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t expected, desul::Impl::dont_deduce_this_parameter_t desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, 
desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t compare, desul::Impl::dont_deduce_this_parameter_t desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); } -namespace Impl { - template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } -} // namespace Impl +template +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} -} // namespace Kokkos +} // namespace Kokkos::Impl #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp index 7dd2a9ddbb71..8233c30b243c 100644 --- a/packages/kokkos/core/src/Kokkos_Complex.hpp +++ b/packages/kokkos/core/src/Kokkos_Complex.hpp @@ -70,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex& other) noexcept // Intentionally do the conversions 
implicitly here so that users don't // get any warnings about narrowing, etc., that they would expect to get @@ -265,9 +264,8 @@ class #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -296,7 +294,7 @@ class // vl = r; // vl = cr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -319,7 +317,7 @@ class // vl = vr; // vl = cvr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -341,7 +339,7 @@ class // l = cvr; // template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -539,7 +537,7 @@ inline bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -551,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -590,7 +588,7 @@ inline bool operator!=(complex const& x, template < class RealType1, 
class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -602,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -778,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex pow(const complex& x, return x == T() ? T() : exp(y * log(x)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow( const T& x, const complex& y) { using type = Impl::promote_2_t; return pow(type(x), complex(y)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, const U& y) { using type = Impl::promote_2_t; diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a034..0bfb9eb5fa40 100644 --- a/packages/kokkos/core/src/Kokkos_Concepts.hpp +++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template struct Schedule { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(std::is_same_v || std::is_same_v, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template struct IndexType { - static_assert(std::is_integral::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool 
value = \ - std::is_base_of, T>::value || \ - std::is_base_of, T>::value; \ + std::is_base_of_v, T> || \ + std::is_base_of_v, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same::value || - std::is_same::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. */ - enum { assignable = std::is_same::value }; + enum { assignable = std::is_same_v }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. 
using space = std::conditional_t< - std::is_same::value || + std::is_same_v || !exe_access::accessible, AccessSpace, Kokkos::Device>; diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp index e856b1924719..7da59aa4e419 100644 --- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -561,21 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -649,21 +648,20 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1350,22 +1348,20 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { - using ValueType = typename 
ViewType::value_type; - alignas(alignof(ValueType)) unsigned char - zero_initialized_storage[sizeof(ValueType)] = {}; - contiguous_fill(exec_space, dst, - *reinterpret_cast(zero_initialized_storage)); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View( + static_cast(dst), cnt), + std::byte{}); } }; template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1375,20 +1371,20 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset>(exec_space, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits::value_type)); else contiguous_fill(exec_space, dst, value); } template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1397,9 +1393,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1411,11 +1405,12 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset(exec, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif contiguous_fill(exec, dst, value); @@ -1423,9 +1418,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1441,8 +1434,8 @@ template inline void deep_copy( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; @@ -1464,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1482,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? 
ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1539,8 +1531,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; @@ -1576,8 +1568,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using dst_type = View; @@ -1587,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1628,8 +1620,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; @@ -1641,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), 
@@ -1772,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2191,8 +2183,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2201,8 +2193,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2568,13 +2560,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2594,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if 
(std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2649,13 +2640,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && !Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2696,8 +2687,8 @@ inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, std::enable_if_t::value && - std::is_same::specialize, - void>::value>* = nullptr) { + std::is_same_v::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2734,8 +2725,8 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits; @@ -2743,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching 
non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2784,15 +2775,15 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; using src_type = View; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2922,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2994,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. 
*/ template -inline typename std::enable_if< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3048,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3066,10 +3057,10 @@ resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3085,10 +3076,10 @@ template inline std::enable_if_t< (Impl::is_view_ctor_property::value || Kokkos::is_execution_space::value) && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ 
-3103,12 +3094,12 @@ resize(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3149,12 +3140,12 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3218,10 +3209,10 @@ inline void resize(Kokkos::View& v, /** \brief Resize a view with discarding old data. 
*/ template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3264,10 +3255,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3283,10 +3274,10 @@ realloc(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3302,10 +3293,10 @@ realloc(Kokkos::View& v, template inline std::enable_if_t< Impl::is_view_ctor_property::value && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3320,12 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - 
Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3365,12 +3356,12 @@ impl_realloc(Kokkos::View& v, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3435,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3450,26 +3441,6 @@ struct MirrorViewType { std::conditional_t; }; -template -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. 
- using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View; -}; - // collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { @@ -3503,7 +3474,7 @@ inline auto create_mirror(const Kokkos::View& src, if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; using dst_type = - typename Impl::MirrorType::view_type; + typename Impl::MirrorViewType::dest_view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3636,12 +3607,12 @@ inline auto create_mirror_view( const Kokkos::View& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::View< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>) { check_view_ctor_args_create_mirror(); return typename Kokkos::View::HostMirror(src); } else { @@ -3785,8 +3756,7 @@ create_mirror_view_and_copy( const Space&, const Kokkos::View& src, std::string const& name = "", std::enable_if_t< - std::is_void::specialize>::value>* = - nullptr) { + std::is_void_v::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp index 1f146563be20..9588d289a9ca 100644 --- a/packages/kokkos/core/src/Kokkos_Core.hpp +++ b/packages/kokkos/core/src/Kokkos_Core.hpp @@ -63,7 +63,9 @@ 
#include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include @@ -248,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -281,7 +283,7 @@ std::vector partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector instances(weights.size()); diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp index 7edb35f00eb4..5dbe5714293f 100644 --- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -106,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -122,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, 
Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -162,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -184,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp index 92931b584952..69223b641289 100644 --- a/packages/kokkos/core/src/Kokkos_Crs.hpp +++ b/packages/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) 
= default; KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42ee..8af10b2a409b 100644 --- a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected::value; template class Op, class... Args> inline constexpr bool is_detected_exact_v = - is_detected_exact::value; + is_detected_exact::value; // NOLINT template class Op, class... 
Args> inline constexpr bool is_detected_convertible_v = diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp index b8d7f77deb30..dd7ce5ce21f6 100644 --- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,10 @@ static_assert(false, #include #include #include +#include +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include +#endif #include //---------------------------------------------------------------------------- @@ -197,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if (std::is_same_v) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -248,46 +250,49 @@ class RangePolicy : public Impl::PolicyTraits { // To be replaced with std::in_range (c++20) template - static void check_conversion_safety(const IndexType bound) { + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v) { #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - std::string msg = - "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " - "is performed on a bound (" + - std::to_string(bound) + - "), which may " - "not preserve its original value.\n"; - bool warn = false; - - if constexpr (std::is_signed_v != - std::is_signed_v) { - // check signed to unsigned - if constexpr (std::is_signed_v) - warn |= (bound < static_cast( - std::numeric_limits::min())); - - // check unsigned to signed - if constexpr (std::is_signed_v) - warn |= (bound > static_cast( - std::numeric_limits::max())); - } + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is 
performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_arithmetic_v && + (std::is_signed_v != + std::is_signed_v)) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } - // check narrowing - warn |= (static_cast(static_cast(bound)) != bound); + // check narrowing + warn |= + (static_cast(static_cast(bound)) != bound); - if (warn) { + if (warn) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Kokkos::abort(msg.c_str()); + Kokkos::abort(msg.c_str()); #endif #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - Kokkos::Impl::log_warning(msg); + Kokkos::Impl::log_warning(msg); #endif - } -#else - (void)bound; + } #endif + } } public: @@ -333,20 +338,20 @@ class RangePolicy : public Impl::PolicyTraits { }; }; -RangePolicy()->RangePolicy<>; +RangePolicy() -> RangePolicy<>; -RangePolicy(int64_t, int64_t)->RangePolicy<>; -RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; -RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) - ->RangePolicy<>; + -> RangePolicy<>; template >> -RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy; template >> -RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy; } // namespace Kokkos @@ -515,24 +520,24 @@ struct PerThreadValue { template struct ExtractVectorLength { static inline iType value( - 
std::enable_if_t::value, iType> val, Args...) { + std::enable_if_t, iType> val, Args...) { return val; } - static inline std::enable_if_t::value, int> value( - std::enable_if_t::value, iType>, Args...) { + static inline std::enable_if_t, int> value( + std::enable_if_t, iType>, Args...) { return 1; } }; template -inline std::enable_if_t::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t, iType> extract_vector_length( + iType val, Args...) { return val; } template -inline std::enable_if_t::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t, int> extract_vector_length( + iType, Args...) { return 1; } @@ -577,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -721,55 +726,54 @@ class TeamPolicy // Execution space not provided deduces to TeamPolicy<> -TeamPolicy()->TeamPolicy<>; +TeamPolicy() -> TeamPolicy<>; -TeamPolicy(int, int)->TeamPolicy<>; -TeamPolicy(int, int, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; // DefaultExecutionSpace deduces to TeamPolicy<> -TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; -TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; 
+TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, - Kokkos::AUTO_t const&) - ->TeamPolicy<>; + Kokkos::AUTO_t const&) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; // ES != DefaultExecutionSpace deduces to TeamPolicy template >> -TeamPolicy(ES const&, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy; template >> TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) - ->TeamPolicy; + -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy; namespace Impl { @@ -1041,7 +1045,7 @@ struct TeamThreadMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange, TeamHandle>; + -> TeamThreadMDRange, TeamHandle>; template struct ThreadVectorMDRange; @@ -1078,7 +1082,7 @@ struct ThreadVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) 
- ->ThreadVectorMDRange, TeamHandle>; + -> ThreadVectorMDRange, TeamHandle>; template struct TeamVectorMDRange; @@ -1115,7 +1119,7 @@ struct TeamVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange, TeamHandle>; + -> TeamVectorMDRange, TeamHandle>; template @@ -1162,7 +1166,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1198,7 +1202,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1217,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template ::value> + bool HasTag = !std::is_void_v> struct ParallelConstructName; template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + std::string(TypeInfo>::name()) + + "/" + std::string(TypeInfo::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1239,7 +1249,11 @@ template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/packages/kokkos/core/src/Kokkos_Extents.hpp b/packages/kokkos/core/src/Kokkos_Extents.hpp index 9bc2eda60469..7d1f8c755d78 100644 --- a/packages/kokkos/core/src/Kokkos_Extents.hpp +++ b/packages/kokkos/core/src/Kokkos_Extents.hpp @@ -134,7 +134,7 @@ struct ApplyExtent { template struct ApplyExtent { - using type = ValueType * [Ext]; + using type = 
ValueType* [Ext]; }; template diff --git a/packages/kokkos/core/src/Kokkos_Future.hpp b/packages/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8c4..c26d08be1cff 100644 --- a/packages/kokkos/core/src/Kokkos_Future.hpp +++ b/packages/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template -class BasicFuture> { +class KOKKOS_DEPRECATED + BasicFuture> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture> { //////////////////////////////////////////////////////////////////////////////// template -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template -struct is_future, ExecSpace> +struct KOKKOS_DEPRECATED is_future, ExecSpace> : std::bool_constant< - std::is_same::value || - std::is_void::value> {}; + std::is_same_v || + std::is_void_v> {}; 
//////////////////////////////////////////////////////////////////////////////// // END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space::value }; enum { Arg2_is_space = Kokkos::is_space::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template -using Future = typename Impl::ResolveFutureArgOrder::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp index 9cc6650e26ed..05d774ac61a2 100644 --- a/packages/kokkos/core/src/Kokkos_Graph.hpp +++ b/packages/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // end Graph }}}1 @@ -135,22 +146,68 @@ Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. 
auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template +std::enable_if_t, + Graph> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v>, + Graph> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // end create_graph }}}1 //============================================================================== +template +decltype(auto) Graph::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template +decltype(auto) Graph::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr 
(std::is_same_v) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,7 +220,7 @@ Graph create_graph(Closure&& arg_closure) { #include #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include #endif #endif diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf6414a..a0a60c07d094 100644 --- a/packages/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same::value || + std::is_same_v || Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same::value || + static_assert(std::is_same_v || Kokkos::Impl::is_graph_kernel::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + 
GraphNodeRef(GraphNodeRef&&) noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template > && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", 
(Policy&&)policy, (Functor&&)functor); } template @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor return this->then_parallel_for(std::move(name), Kokkos::RangePolicy(0, n), - (Functor &&) functor); + (Functor&&)functor); } template auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } + using return_type = // Yes, you do really have to do this... 
std::conditional_t::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp index 8b5f29f95b21..706586826f48 100644 --- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp @@ -63,10 +63,10 @@ class HostSpace { //! 
This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -183,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { @@ -202,10 +190,15 @@ struct DeepCopy { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp index 37b80e54a85f..a760e7054a16 100644 --- a/packages/kokkos/core/src/Kokkos_Layout.hpp +++ b/packages/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { 
is_extent_constructible = true }; - LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + 
stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. * @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp index 0a0acd303f52..97b78a3c6485 100644 --- a/packages/kokkos/core/src/Kokkos_Macros.hpp +++ b/packages/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. 
@@ -132,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. +// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. @@ -252,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -273,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -298,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -357,6 +357,21 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported 
for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION +#endif + //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -369,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -537,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -582,9 +604,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ - _Pragma("diag_suppress 1216") + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ - _Pragma("diag_default 1216") + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ @@ -607,6 +631,18 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif // clang-format on #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp 
b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp index ce8c9e152fa3..f7e9e2a78c45 100644 --- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05fa..1304d3ba9260 100644 --- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper { static constexpr long dou #endif template struct digits_helper {}; template <> struct digits_helper { static constexpr int value = 1; }; -template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed::value; }; +template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed_v; }; template <> struct digits_helper { static constexpr int value 
= CHAR_BIT - 1; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp index e569fefc14df..c44d1f231098 100644 --- a/packages/kokkos/core/src/Kokkos_Pair.hpp +++ b/packages/kokkos/core/src/Kokkos_Pair.hpp @@ -449,7 +449,8 @@ struct KOKKOS_DEPRECATED pair { // Specialization of relational operators for Kokkos::pair. // -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #endif template @@ -487,7 +488,8 @@ KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif #endif diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp index 122239df7908..24349e95aea1 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value 
|| - std::is_same::value, + std::is_same_v, "A functor with both an execution space and device type is " "given but their execution space types do not match!"); @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< @@ -348,9 +350,11 @@ template ::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 53913266f130..3b89d184f2a4 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -73,7 +73,7 @@ struct Sum { template KOKKOS_DEDUCTION_GUIDE Sum(View const&) - ->Sum::memory_space>; + -> Sum::memory_space>; template struct Prod { @@ -118,7 +118,7 @@ struct Prod { template KOKKOS_DEDUCTION_GUIDE Prod(View const&) - ->Prod::memory_space>; + -> Prod::memory_space>; template struct Min { @@ -165,7 +165,7 @@ struct Min { template KOKKOS_DEDUCTION_GUIDE Min(View const&) - ->Min::memory_space>; + -> 
Min::memory_space>; template struct Max { @@ -213,7 +213,7 @@ struct Max { template KOKKOS_DEDUCTION_GUIDE Max(View const&) - ->Max::memory_space>; + -> Max::memory_space>; template struct LAnd { @@ -259,7 +259,7 @@ struct LAnd { template KOKKOS_DEDUCTION_GUIDE LAnd(View const&) - ->LAnd::memory_space>; + -> LAnd::memory_space>; template struct LOr { @@ -306,7 +306,7 @@ struct LOr { template KOKKOS_DEDUCTION_GUIDE LOr(View const&) - ->LOr::memory_space>; + -> LOr::memory_space>; template struct BAnd { @@ -353,7 +353,7 @@ struct BAnd { template KOKKOS_DEDUCTION_GUIDE BAnd(View const&) - ->BAnd::memory_space>; + -> BAnd::memory_space>; template struct BOr { @@ -400,7 +400,7 @@ struct BOr { template KOKKOS_DEDUCTION_GUIDE BOr(View const&) - ->BOr::memory_space>; + -> BOr::memory_space>; template struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,11 +463,10 @@ struct MinLoc { }; template -KOKKOS_DEDUCTION_GUIDE MinLoc( - View, Properties...> const&) - ->MinLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MaxLoc { @@ -494,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -514,11 +523,10 @@ struct MaxLoc { }; template -KOKKOS_DEDUCTION_GUIDE MaxLoc( - View, Properties...> const&) - ->MaxLoc, - Properties...>::memory_space>; 
+KOKKOS_DEDUCTION_GUIDE +MaxLoc(View, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MinMaxScalar { @@ -580,8 +588,8 @@ struct MinMax { template KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) - ->MinMax, Properties...>::memory_space>; + -> MinMax, Properties...>::memory_space>; template struct MinMaxLocScalar { @@ -622,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -650,9 +664,9 @@ struct MinMaxLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxLoc( View, Properties...> const&) - ->MinMaxLoc, - Properties...>::memory_space>; + -> MinMaxLoc, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -718,9 +732,9 @@ struct MaxFirstLoc { template KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( View, Properties...> const&) - ->MaxFirstLoc, - Properties...>::memory_space>; + -> MaxFirstLoc, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -788,9 +802,9 @@ template KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator, + Properties...>::memory_space>; // // MinFirstLoc @@ -852,9 +866,9 @@ struct MinFirstLoc { template KOKKOS_DEDUCTION_GUIDE MinFirstLoc( View, Properties...> const&) - ->MinFirstLoc, - Properties...>::memory_space>; + -> MinFirstLoc, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -922,9 +936,9 @@ template KOKKOS_DEDUCTION_GUIDE 
MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator, + Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -997,9 +1011,9 @@ struct MinMaxFirstLastLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( View, Properties...> const&) - ->MinMaxFirstLastLoc, - Properties...>::memory_space>; + -> MinMaxFirstLastLoc, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1077,7 +1091,7 @@ template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View, Properties...>::memory_space>; @@ -1139,10 +1153,9 @@ struct FirstLoc { }; template -KOKKOS_DEDUCTION_GUIDE FirstLoc( - View, Properties...> const&) - ->FirstLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View, Properties...> const&) -> FirstLoc< + Index, typename View, Properties...>::memory_space>; // // LastLoc @@ -1202,8 +1215,8 @@ struct LastLoc { template KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) - ->LastLoc, Properties...>::memory_space>; + -> LastLoc, + Properties...>::memory_space>; template struct StdIsPartScalar { @@ -1270,8 +1283,8 @@ struct StdIsPartitioned { template KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( View, Properties...> const&) - ->StdIsPartitioned, - Properties...>::memory_space>; + -> StdIsPartitioned, + Properties...>::memory_space>; template struct StdPartPointScalar { @@ -1333,8 +1346,8 @@ struct StdPartitionPoint { template KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( View, Properties...> const&) - ->StdPartitionPoint, - Properties...>::memory_space>; + -> StdPartitionPoint, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1404,9 +1417,9 @@ struct ParallelReduceReturnValue< template struct 
ParallelReduceReturnValue< std::enable_if_t::value && - (!std::is_array::value && - !std::is_pointer::value) && - !Kokkos::is_reducer::value>, + (!std::is_array_v && + !std::is_pointer_v< + ReturnType>)&&!Kokkos::is_reducer::value>, ReturnType, FunctorType> { using return_type = Kokkos::View; @@ -1422,8 +1435,8 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array::value || - std::is_pointer::value)>, + std::enable_if_t<(std::is_array_v || + std::is_pointer_v)>, ReturnType, FunctorType> { using return_type = Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1434,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array::value) + if (std::is_array_v) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1467,8 +1480,7 @@ struct ParallelReducePolicyType< template struct ParallelReducePolicyType< - std::enable_if_t::value>, PolicyType, - FunctorType> { + std::enable_if_t>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; @@ -1501,27 +1513,28 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c::value, + Kokkos::Impl::if_c, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis; - using CombinedFunctorReducerType = CombinedFunctorReducer; + + CombinedFunctorReducerType functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = 
response.policy; + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce::execution_space>>( - CombinedFunctorReducerType( - functor, typename Analysis::Reducer( - ReducerSelector::select(functor, return_value))), - inner_policy, + functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); closure.execute(); @@ -1536,7 +1549,7 @@ struct ParallelReduceAdaptor { template static inline std::enable_if_t::value)> + std::is_pointer_v)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1568,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same(nullptr))>::value + value = std::is_same_v(nullptr))> }; }; @@ -1611,7 +1624,7 @@ struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... 
args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1663,11 +1676,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1684,11 +1697,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1704,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1728,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return 
type!"); @@ -1754,7 +1767,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1771,7 +1784,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1787,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1806,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index e7a9ba0c7ed7..1759c2b4a1c5 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -32,7 +32,7 @@ class [[nodiscard]] ProfilingSection { uint32_t sectionID; public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 
201907 diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e9f..a4168b9401fa 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a339e..f00e25fdb629 100644 --- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL const auto end_iter = diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec26a..3edecb4502a8 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler 
-Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ Impl::TaskPolicyWithScheduler -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> host_spawn(Impl::TaskPolicyWithScheduler arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler */ template -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor> * 2) High, Normal, or Low priority */ template -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION 
respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = TaskPriority::Regular) { static_assert(Kokkos::is_future::value || Kokkos::is_scheduler::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template -inline void wait(BasicTaskScheduler const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler const& scheduler) { using scheduler_type = BasicTaskScheduler; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf0b..83e1c06db9b9 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; template -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED 
is_scheduler> + : public std::true_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, Impl::TaskQueueTraitsLockBased>>; template -using TaskSchedulerMultiple = SimpleTaskScheduler< +using TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -void wait(BasicTaskScheduler const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler const&); 
namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff1832..ab31484d76ac 100644 --- a/packages/kokkos/core/src/Kokkos_Timer.hpp +++ b/packages/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp index f5ffc66af5b5..fcb061b378f2 100644 --- a/packages/kokkos/core/src/Kokkos_Tuners.hpp +++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step, + bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -412,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy& 
policy, + const Kokkos::TeamPolicy& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy; + using PolicyType = Kokkos::TeamPolicy; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -505,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { } template - void tune(Kokkos::TeamPolicy& policy) { + auto tune(const Kokkos::TeamPolicy& policy_in) { + Kokkos::TeamPolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -515,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template +struct tuning_type_for; + +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template +class SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + 
Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast(lower), static_cast(upper), + static_cast(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template + RangePolicyOccupancyTuner(const std::string& name, + const Kokkos::RangePolicy&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template + auto tune(const Kokkos::RangePolicy& policy_in) { + Kokkos::RangePolicy policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -578,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin> { 
policy.impl_change_tile_size({std::get(tuple)...}); } template - void tune(Kokkos::MDRangePolicy& policy) { + auto tune(const Kokkos::MDRangePolicy& policy_in) { + Kokkos::MDRangePolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/packages/kokkos/core/src/Kokkos_TypeInfo.hpp b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 000000000000..e5710da2e3d5 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include +#include +#include + +#include + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. 
+// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template +constexpr std::array to_array(std::string_view src) { + std::array dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end != std::string_view::npos); + return to_array(func.substr(beg, end)); +} + +template +class TypeInfo { + static constexpr auto value_ = type_name(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template +class TypeInfo { + public: + static constexpr 
std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp index 04d1fcf15184..d5b352876c30 100644 --- a/packages/kokkos/core/src/Kokkos_View.hpp +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -22,2016 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include -#include -#include -#endif -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewArrayAnalysis; - -template ::non_const_value_type> -struct ViewDataAnalysis; - -template -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { is_assignable = false }; -}; - -template -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} - -// FIXME Ideally, we would not instantiate this function for every possible View -// type. We should be able to only pass "extent" when we use mdspan. 
-template -KOKKOS_INLINE_FUNCTION void runtime_check_rank( - const View&, const bool is_void_spec, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t i7, const char* label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - // We either allow to pass as many extents as the dynamic rank is, or - // as many extents as the total rank is. In the latter case, the given - // extents for the static dimensions must match the - // compile-time extents. - constexpr int rank = View::rank(); - constexpr int dyn_rank = View::rank_dynamic(); - const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; - const bool n_args_is_rank = num_passed_args == rank; - - if constexpr (rank != dyn_rank) { - if (n_args_is_rank) { - size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - for (int i = dyn_rank; i < rank; ++i) - if (new_extents[i] != View::static_extent(i)) { - KOKKOS_IF_ON_HOST( - const std::string message = - "The specified run-time extent for Kokkos::View '" + - std::string(label) + - "' does not match the compile-time extent in dimension " + - std::to_string(i) + ". The given extent is " + - std::to_string(new_extents[i]) + " but should be " + - std::to_string(View::static_extent(i)) + ".\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "The specified run-time extents for a Kokkos::View " - "do not match the compile-time extents.");) - } - } - } - - if (!n_args_is_dyn_rank && !n_args_is_rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos::View '" + std::string(label) + - "' has mismatched number of arguments. 
The number " - "of arguments = " + - std::to_string(num_passed_args) + - " neither matches the dynamic rank = " + - std::to_string(dyn_rank) + - " nor the total rank = " + std::to_string(rank) + "\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template -struct ViewTraits; - -template <> -struct ViewTraits { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = 
typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = HooksPolicy; -}; - -template -struct ViewTraits::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. 
- - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::HostMirrorSpace, - void>::value && - std::is_same::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::array_layout, - void>::value && - std::is_same::memory_traits, - void>::value && - std::is_same::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits; - - using ExecutionSpace = - std::conditional_t::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void::value, - typename 
prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror::Space>; - - using MemoryTraits = - std::conditional_t::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - std::conditional_t::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. 
- - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -namespace Impl { -struct UnsupportedKokkosArrayLayout; - -template -struct MDSpanViewTraits { - using mdspan_type = UnsupportedKokkosArrayLayout; -}; - -// "Natural" mdspan for a view if the View's ArrayLayout is supported. -template -struct MDSpanViewTraits::type>> { - using index_type = std::size_t; - using extents_type = - typename Impl::ExtentsFromDataType::type; - using mdspan_layout_type = - typename Impl::LayoutFromArrayLayout::type; - using accessor_type = Impl::SpaceAwareAccessor< - typename Traits::memory_space, - Kokkos::default_accessor>; - using mdspan_type = mdspan; -}; -} // namespace Impl -#endif // KOKKOS_ENABLE_IMPL_MDSPAN - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. 
When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Space. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. 
For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template -struct is_always_assignable_impl; - -template -struct is_always_assignable_impl, - Kokkos::View> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View::traits, - typename Kokkos::View::traits, - typename Kokkos::View::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast(Kokkos::View::rank_dynamic) >= - static_cast(Kokkos::View::rank_dynamic); -}; - -template -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t, - std::remove_const_t>>; - -template -inline constexpr bool is_always_assignable_v = - is_always_assignable::value; - -template -constexpr bool is_assignable(const Kokkos::View& dst, - const Kokkos::View& src) { - using DstTraits = typename Kokkos::View::traits; - using SrcTraits = typename Kokkos::View::traits; - using mapping_type = - Kokkos::Impl::ViewMapping; - - return is_always_assignable_v, - 
Kokkos::View> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include #endif -inline constexpr Kokkos::ALL_t ALL{}; - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. 
- * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template -inline Impl::ViewCtorProp::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -class View; - -template -struct is_view : public std::false_type {}; - -template -struct is_view> : public std::true_type {}; - -template -struct is_view> : public std::true_type {}; - -template -inline constexpr bool is_view_v = is_view::value; - -template -class View : public ViewTraits { - private: - template - friend class View; - template - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker; - - public: - using traits = ViewTraits; - - private: - using map_type = - Kokkos::Impl::ViewMapping; - template - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - 
//---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View; - - /** \brief Compatible view of const data type */ - using const_type = - View; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using host_mirror_type = - View; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType::type; - using uniform_const_type = - typename Impl::ViewUniformType::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant - rank = {}; - static constexpr Impl::integral_constant - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const 
noexcept { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - -#endif - - template - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ - static_assert(rank <= sizeof...(Is)); - static_assert(sizeof...(Is) <= 8); - static_assert(Kokkos::Impl::are_integral::value); - } - - template - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is)); - static_assert(Kokkos::Impl::are_integral::value); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return 
m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
- return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
- return m_map.reference(i0, i1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
- return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. 
- - template - KOKKOS_INLINE_FUNCTION View( - const View& rhs, - std::enable_if_t::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View& rhs) { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, - Args... 
args) - : m_track(src_view), m_map() { - using SrcType = View; - - using Mapping = Kokkos::Impl::ViewMapping; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, alloc_name.c_str()); - } -#endif - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - 
size_t i7 = arg_layout.dimension[7]; - - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, "UNMANAGED"); - } -#endif - } - - // Simple dimension-only layout - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. 
- template - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(reinterpret_cast( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, 
arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - //---------------------------------------- - // MDSpan converting constructors -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN - template ::mdspan_type> - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(traits::is_managed) -#endif - View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, - std::enable_if_t< - !std::is_same_v>* = - nullptr) - : View(mds.data_handle(), - Impl::array_layout_from_mapping< - typename traits::array_layout, - typename Impl::MDSpanViewTraits::mdspan_type>( - mds.mapping())) { - } - - template - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(!std::is_convertible_v< - Kokkos::mdspan, - typename Impl::MDSpanViewTraits::mdspan_type>) -#endif - View(const Kokkos::mdspan& mds) - : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { - } - - //---------------------------------------- - // Conversion to MDSpan - template ::mdspan_type, - typename = std::enable_if_t, - std::false_type, - std::is_assignable, - ImplNaturalMDSpanType>>::value>> - KOKKOS_INLINE_FUNCTION constexpr operator mdspan< - OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - return mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map)}; - } - - template >, - typename = std::enable_if_t>> - KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( - const OtherAccessorType& other_accessor = - typename Impl::MDSpanViewTraits::accessor_type()) { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - using ret_mdspan_type = - mdspan; - return ret_mdspan_type{data(), - 
Impl::mapping_from_view_mapping(m_map), - other_accessor}; - } -#endif // KOKKOS_ENABLE_IMPL_MDSPAN -}; - -template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { - return View::rank(); -} - -namespace Impl { - -template -struct RankDataType { - using type = typename RankDataType::type*; -}; - -template -struct RankDataType { - using type = ValueType; -}; - -template -KOKKOS_FUNCTION std::enable_if_t< - N == View::rank() && - std::is_same::specialize, void>::value, - View> -as_view_of_rank_n(View v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template -KOKKOS_FUNCTION std::enable_if_t< - N != View::rank() && - std::is_same::specialize, void>::value, - View::value_type, N>::type, - Args...>> -as_view_of_rank_n(View) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template -void apply_to_view_of_static_rank(Function&& f, View a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template -struct TypeListToViewTraits; - -template -struct TypeListToViewTraits> { - using type = ViewTraits; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. 
-template -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list; - using memory_traits = typename ViewTraits::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first::type; - using new_memory_traits = - Kokkos::MemoryTraits; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list>::type; - - public: - using type = typename TypeListToViewTraits::type; -}; -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... 
args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - -template -using Subview = decltype(subview(std::declval(), std::declval()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, - const View& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits; - using rhs_traits = ViewTraits; - - return std::is_same::value && - std::is_same::value && - std::is_same::value && - View::rank() == View::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template -KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, - const View& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct CommonViewValueType; - -template -struct CommonViewValueType { - using value_type = std::common_type_t; -}; - -template -struct CommonViewAllocProp; - -template -struct CommonViewAllocProp { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template - KOKKOS_INLINE_FUNCTION 
CommonViewAllocProp(const Views&...) {} -}; - -template -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template -struct DeduceCommonViewAllocProp { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view::value }; - - using prop_type = CommonViewAllocProp; -}; - -template -struct DeduceCommonViewAllocProp { - using NextTraits = DeduceCommonViewAllocProp; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same::value && - !std::is_void::value && - !std::is_void::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same::value, first_specialize, - std::conditional_t<(std::is_void::value && - !std::is_void::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp; -}; - -} // end namespace Impl - -template -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. 
The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. -// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType(views...); -} - -} // namespace Kokkos - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +#include -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e36..4d2263428154 100644 --- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp 
b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index 99daf379b6ff..37fcfb7a1d99 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include #include +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include +#endif + #include +#include Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -59,11 +73,46 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + 
Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. +#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -86,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index 5155bee33dc3..aee696bd34e6 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. 
#ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,9 +88,9 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED static bool in_parallel() { diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbdf3..75cef98a8d91 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb8f..1373f8fa7a48 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template \ - KOKKOS_FUNCTION void operator()(Args 
&&... args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v) { \ m_functor(static_cast(args)...); \ } else { \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd3136..1dad499c1bec 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index c3d723687270..343d9921a95a 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,11 +30,12 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& 
operator=(const OpenACCInternal&) = default; public: static int m_acc_device_num; + static int m_concurrency; int m_async_arg = acc_async_noval; OpenACCInternal() = default; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7bec..629d26928ed3 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 
- begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = 
begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format 
off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - 
int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector 
copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; 
+ auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void 
OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -374,16 +508,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - 
int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, 
int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; 
- int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - 
int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // 
clang-format on diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 5afb5e75d392..2b5631d6f8a3 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 
+ begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / 
dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* 
clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void 
OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + 
auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 
0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 430bdcb68088..d4cb73164d20 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, 
tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 
@@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; 
+ + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb0a..b1c48baa1e73 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c388..95526aa7849c 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index a403909f677c..aa4be87ceb62 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ 
b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -30,7 +30,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -93,11 +92,16 @@ class OpenMP { void fence(std::string const& name = "Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) noexcept { + return false; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); @@ -154,10 +158,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? 
impl_thread_pool_size() : 1; } @@ -202,7 +202,9 @@ struct MemorySpaceAccess #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 2877d940fafc..6edcbff0c26b 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include #include +#include + #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -360,6 +367,10 @@ extern template class TaskQueue #include #include -#include #include #include #include @@ -148,7 +147,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { #include #include #include -#include /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ed625cfcc82c..ec33d25b9695 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,9 +92,9 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + 
OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; @@ -141,79 +142,5 @@ class OpenMPTargetSpace { KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::Experimental::OpenMPTargetSpace); -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, 
size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 000000000000..aace09e266b0 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, 
omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 6c5eb048e34e..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. - if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; -std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using 
clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. - // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 000000000000..13b509c0ada0 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include +#include + +namespace Kokkos::Experimental::Impl { + +template +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v) { + m_functor(static_cast(args)...); + } else { + m_functor(WorkTag(), static_cast(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 44e9119ea886..53e723882f55 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. 
undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include #include #include #include #include -#include #include @@ -105,18 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. @@ -136,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. 
+ if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. + // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. 
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is depenedent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -206,9 +271,9 @@ namespace Experimental { UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -221,7 +286,7 @@ UniqueTokenm_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -229,8 +294,7 @@ UniqueToken - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); 
+ static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index e222d6525010..f71f8887135e 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -116,8 +116,8 @@ class OpenMPTargetExecTeamMember { // FIXME_OPENMPTARGET this function currently ignores the reducer passed. template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, typename ReducerType::value_type& value) const - noexcept { + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier using value_type = typename ReducerType::value_type; @@ -741,43 +741,6 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. 
In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. - static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static void* get_scratch_ptr(); - static void clear_scratch(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static std::mutex m_mutex_scratch_ptr; - static int64_t m_scratch_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d73..38ed7c5681a1 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -20,6 +20,8 @@ #include #include #include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -31,38 +33,38 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; + using Policy = Kokkos::MDRangePolicy; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + const FunctorAdapter m_functor; - const 
FunctorType m_functor; const Policy m_policy; public: inline void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); + Policy policy = m_policy; - typename Policy::point_type unused; static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); execute_tile( - unused, functor, policy, + m_functor, policy, std::integral_constant()); } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -72,18 +74,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i0 = begin_0; i0 < end_0; ++i0) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -96,10 +94,7 @@ class ParallelFor, for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < 
end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -107,9 +102,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -125,10 +119,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -137,9 +128,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -158,11 +148,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -172,9 +158,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, 
+ OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -197,12 +182,7 @@ class ParallelFor, for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } @@ -214,9 +194,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -226,18 +205,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i1 = begin_1; i1 < end_1; ++i1) for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -250,10 +225,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -261,9 +233,8 @@ class ParallelFor, template inline 
std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -279,10 +250,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -291,9 +259,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -312,11 +279,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -326,9 +289,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -351,12 +313,7 @@ class ParallelFor, for (auto i1 = begin_1; 
i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b1a..502461cc5e08 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include #include #include +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter m_functor; const Policy m_policy; public: - void execute() const { execute_impl(); } + void execute() const { execute_impl(); } - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - 
a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 26085f11400f..77dc71a87b78 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace Kokkos { @@ -76,28 +77,27 @@ class ParallelFor, using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + execute_impl(); } private: - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -105,11 +105,12 @@ class ParallelFor, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + 
m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -122,7 +123,7 @@ class ParallelFor, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -161,16 +162,13 @@ class ParallelFor, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -180,10 +178,7 @@ class ParallelFor, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index e86a12197497..bee604834c78 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ 
b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -37,9 +37,8 @@ class ParallelReduce; + public: inline void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr, + functor, m_policy, m_result_ptr, std::integral_constant()); } @@ -77,7 +81,7 @@ class ParallelReduce inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -90,32 +94,23 @@ class ParallelReduce::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - 
else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -126,7 +121,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -141,38 +136,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename 
Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -184,7 +170,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -201,40 +187,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), 
i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -247,7 +222,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -266,26 +241,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -293,18 +260,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < 
end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -318,7 +280,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -339,27 +301,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -368,19 +322,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) 
for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -395,7 +344,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -408,32 +357,23 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. 
if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -444,7 +384,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -459,38 +399,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. 
if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -502,7 +433,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -519,40 +450,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. 
if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -565,7 +485,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -584,26 +504,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. 
if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -611,18 +523,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -636,7 +543,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -657,27 +564,19 @@ 
reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -686,19 +585,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp 
b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4a112ed11d06..b7c8abcb4495 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,14 +54,17 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -75,26 +77,26 @@ class ParallelReduce, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. 
if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. 
- ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 16c0eedb8185..b81e3aa7ed0b 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. 
For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + 
initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const 
Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index b0d693280243..ec8a96cb2f36 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ 
class ParallelScan, protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -143,14 
+133,9 @@ class ParallelScan, local_offset_value = element_values(team_id, i - 1); // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ - !defined(KOKKOS_ARCH_AMD_GFX1100) + !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr (Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,9 +153,9 @@ class ParallelScan, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; @@ -179,7 +163,7 @@ class ParallelScan, // Only let one ParallelReduce instance at a time use the scratch memory. 
std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; @@ -231,7 +215,9 @@ class ParallelScanWithTotal, if (N > 0) { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View #include #include +#include namespace Kokkos { namespace Impl { @@ -72,7 +73,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -82,12 +82,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:reducer"); const auto begin = p.begin(); @@ -104,33 +107,27 @@ struct ParallelReduceSpecialize, return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template - static void execute_array(const FunctorType& f, const PolicyType& p, + template + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -150,27 +147,14 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. 
if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -186,13 +170,10 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -200,12 +181,12 @@ struct ParallelReduceSpecialize, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:init_join"); const auto begin = p.begin(); @@ -219,23 +200,25 @@ struct ParallelReduceSpecialize, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = static_cast( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -260,8 +243,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -279,11 +261,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. 
#pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. @@ -304,8 +282,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -344,7 +321,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -355,12 +331,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -370,9 +349,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, 
shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -383,16 +364,15 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) KOKKOS_IMPL_OMPTARGET_PRAGMA( @@ -414,16 +394,13 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -433,10 +410,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -447,12 +421,12 @@ struct ParallelReduceSpecialize, } template - static void 
execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -462,9 +436,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -473,7 +449,7 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -504,19 +480,14 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. 
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -531,10 +502,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -545,10 +513,10 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -562,10 +530,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -577,12 +542,12 @@ struct ParallelReduceSpecialize, // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. 
- static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -611,13 +576,14 @@ struct ParallelReduceSpecialize, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -661,11 +627,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target @@ -673,7 +635,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { #pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) 
is_device_ptr(scratch_ptr) + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 9b578aca1129..4308fb042a34 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -34,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -51,11 +48,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::sum(); @@ -72,11 +64,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::prod(); @@ -95,11 +82,6 @@ struct OpenMPTargetReducerWrapper> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::min(); @@ -118,11 +100,6 @@ struct OpenMPTargetReducerWrapper> { if (src > dest) 
dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -141,11 +118,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::land(); @@ -166,11 +138,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::lor(); @@ -189,11 +156,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::band(); @@ -212,11 +174,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::bor(); @@ -236,12 +193,12 @@ struct OpenMPTargetReducerWrapper> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -263,12 +220,12 @@ struct 
OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -298,16 +255,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -331,22 +278,16 @@ struct OpenMPTargetReducerWrapper> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -385,15 +326,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::max(); @@ -428,15 +360,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::min(); @@ -480,23 +403,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -531,13 +437,6 @@ struct OpenMPTargetReducerWrapper> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity::min(); @@ -569,13 +468,6 @@ struct OpenMPTargetReducerWrapper> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? 
src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity::max(); @@ -611,17 +503,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity::max(); @@ -654,13 +535,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity::min(); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43e6..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue; - -//---------------------------------------------------------------------------- - -TaskExec::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue scratch_reduce memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. 
- - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization::execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type *task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? 
end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. 
- team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization:: - iff_single_thread_recursive_execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using Member = TaskExec; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f17..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue; - using task_base_type = Kokkos::Impl::TaskBase; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. - template - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - 
TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. - */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, - count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - 
loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = 
shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 4de6931918e4..2583a1cdc047 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -100,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif #ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; #else @@ -172,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -261,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info() << "\nImage Max Buffer Size: " << device.get_info() - << "\nImage Max Array Size: " - << device.get_info() << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " << device.get_info() @@ -317,5 +318,4 @@ int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index 0f3d1f0994df..937dcceab483 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -91,9 +90,8 @@ class SYCL { /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; @@ -131,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits { +struct DeviceTypeTraits { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -185,10 +181,11 @@ std::vector partition_space(const SYCL& sycl_space, return instances; } +} // namespace Experimental + namespace Impl { std::vector get_sycl_devices(); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd3881..a9e2eca4fb3a 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const 
Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value && is_sycl_type_space::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + is_sycl_type_space::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template struct DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp index 9c39df941592..54ca64599532 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -32,30 +32,29 @@ namespace Impl { template -class GraphNodeKernelImpl - : public 
PatternImplSpecializationFromTag< - PatternTag, Functor, PolicyType, Args..., - Kokkos::Experimental::SYCL>::type { +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { public: using Policy = PolicyType; using graph_kernel = GraphNodeKernelImpl; - using base_t = typename PatternImplSpecializationFromTag< - PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + using base_t = + typename PatternImplSpecializationFromTag::type; // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) {} template - GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, - Functor arg_functor, PolicyDeduced&& arg_policy) + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} + (PolicyDeduced&&)arg_policy) {} void set_sycl_graph_ptr( sycl::ext::oneapi::experimental::command_graph< @@ -102,14 +101,14 @@ template ::type> struct get_graph_node_kernel_type - : type_identity> {}; + : type_identity< + GraphNodeKernelImpl> {}; template struct get_graph_node_kernel_type : type_identity, Kokkos::ParallelReduceTag>> {}; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp index 6bbe6711a2e8..828f1cacb4ac 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -28,7 +28,7 @@ 
namespace Kokkos { namespace Impl { template <> -struct GraphNodeBackendSpecificDetails { +struct GraphNodeBackendSpecificDetails { std::optional node; explicit GraphNodeBackendSpecificDetails() = default; @@ -38,16 +38,16 @@ struct GraphNodeBackendSpecificDetails { }; template -struct GraphNodeBackendDetailsBeforeTypeErasure { +struct GraphNodeBackendDetailsBeforeTypeErasure { protected: GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} }; } // namespace Impl diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index 1dc4a9c99739..dc63052dd7a7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -31,29 +31,28 @@ namespace Kokkos { namespace Impl { template <> -class GraphImpl { +class GraphImpl { public: - using node_details_t = - GraphNodeBackendSpecificDetails; - using root_node_impl_t = GraphNodeImpl; + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; using aggregate_node_impl_t = - GraphNodeImpl; // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); - explicit GraphImpl(Kokkos::Experimental::SYCL instance); + explicit GraphImpl(Kokkos::SYCL instance); void add_node(std::shared_ptr const& arg_node_ptr); @@ -63,19 +62,25 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::SYCL& exec); - Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + Kokkos::SYCL const& get_execution_space() const noexcept; auto create_root_node_ptr(); template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } - Kokkos::Experimental::SYCL m_execution_space; + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; sycl::ext::oneapi::experimental::command_graph< sycl::ext::oneapi::experimental::graph_state::modifiable> m_graph; @@ -84,17 +89,16 @@ class GraphImpl { m_graph_exec; }; -inline GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl( - Kokkos::Experimental::SYCL instance) +inline GraphImpl::GraphImpl(Kokkos::SYCL instance) : m_execution_space(std::move(instance)), m_graph(m_execution_space.sycl_queue().get_context(), m_execution_space.sycl_queue().get_device()) {} -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // add an empty node that 
needs to be set up before finalizing the graph arg_node_ptr->node_details_t::node = m_graph.add(); @@ -103,7 +107,7 @@ inline void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in its policy template -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -122,7 +126,7 @@ inline void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -inline void GraphImpl::add_predecessor( +inline void GraphImpl::add_predecessor( NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); @@ -137,19 +141,19 @@ inline void GraphImpl::add_predecessor( m_graph.make_edge(*pred_node, *node); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::SYCL& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::Experimental::SYCL const& -GraphImpl::get_execution_space() const noexcept { +inline Kokkos::SYCL const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } -inline auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), _graph_node_is_root_ctor_tag{}); @@ -158,7 +162,7 @@ inline auto GraphImpl::create_root_node_ptr() { } template -inline auto GraphImpl::create_aggregate_ptr( +inline auto GraphImpl::create_aggregate_ptr( PredecessorRefs&&...) 
{ // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 5843dca81239..5af1330d9390 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. -static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -55,8 +53,8 @@ Kokkos::View sycl_global_unique_token_locks( SYCLInternal::~SYCLInternal() { if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -171,12 +169,12 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
- auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && @@ -184,9 +182,9 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -255,7 +253,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -265,8 +263,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - m_scratchSpace = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchSpace", 
alloc_size)); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -276,7 +274,7 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -286,8 +284,8 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - m_scratchHost = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + m_scratchHost = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -297,7 +295,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -307,8 +305,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - m_scratchFlags = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); // We only zero-initialize the allocation when we actually allocate. 
// It's the responsibility of the features using scratch_flags, @@ -326,8 +324,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( template void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -364,8 +361,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); - m_data = - alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); @@ -396,5 +392,4 @@ template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 2d784ef8a5f0..c982154a9a82 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,10 +37,10 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); Kokkos::Impl::sycl_device_ptr scratch_flags(const 
std::size_t size); @@ -76,8 +75,9 @@ class SYCLInternal { mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); std::optional m_queue; // Using std::vector> reveals a compiler bug when @@ -102,9 +102,9 @@ class SYCLInternal { explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -119,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t>; + using AllocationSpace = + std::conditional_t>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -144,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -324,13 +323,12 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && 
defined(KOKKOS_ARCH_INTEL_GPU) template struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper> + Kokkos::Impl::SYCLFunctionWrapper> : std::true_type {}; #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ @@ -352,8 +350,7 @@ static_assert( template struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper, + const Kokkos::Impl::SYCLFunctionWrapper, std::enable_if_t>>> : std::true_type {}; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc3a..9498513a3e8c 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction { +struct default_outer_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction { +struct default_inner_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties( + const Kokkos::SYCL& space) { TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties( // Settings for TeamMDRangePolicy template -struct ThreadAndVectorNestLevel +struct ThreadAndVectorNestLevel : AcceleratorBasedNestLevel {}; } // namespace Impl diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index cb7b1048da35..3dbd63d81ad5 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ 
b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 +25,7 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -180,12 +180,11 @@ class Kokkos::Impl::ParallelFor, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 8ef43d392c6a..da75f3e901d1 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -67,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; @@ -82,8 +82,8 @@ class Kokkos::Impl::ParallelFor, sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, const sycl::event& memcpy_event) const { // Convenience references - const 
Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -160,13 +160,13 @@ class Kokkos::Impl::ParallelFor, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index cf7f582bc79f..d8859cda9f3e 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -27,11 +27,11 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -52,8 +52,8 @@ class Kokkos::Impl::ParallelFor, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = 
space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -146,11 +146,11 @@ class Kokkos::Impl::ParallelFor, scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); @@ -164,10 +164,14 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 0774b24bca16..1e313549757b 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,7 +76,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -85,7 +85,7 @@ class Kokkos::Impl::ParallelReduce 
scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -370,7 +368,7 @@ class Kokkos::Impl::ParallelReduce template -class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,7 +48,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -59,8 +58,8 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b443bcbf9023..8f5310cbb21c 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -29,7 +29,7 @@ 
template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -46,7 +46,7 @@ class Kokkos::Impl::ParallelReduce(m_scratch_size[1]) * m_league_size)); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, @@ -436,16 +434,21 @@ class Kokkos::Impl::ParallelReduce::accessible), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // requested one. 
if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -461,7 +464,7 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bdb5b8837705..ed7cee2805d9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: @@ -161,8 +161,8 @@ class ParallelScanSYCLBase { sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -374,11 +374,11 @@ class ParallelScanSYCLBase { std::scoped_lock scratch_buffers_lock( instance.m_mutexScratchSpace); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -399,7 +399,7 @@ class 
ParallelScanSYCLBase { template class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; @@ -417,13 +417,12 @@ class Kokkos::Impl::ParallelScan, template class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { @@ -445,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible), m_exec(arg_policy.space()) {} }; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 19fad29150e5..022f88e0a812 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -33,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -46,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - 
Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - Experimental::SYCL().fence( - "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -60,12 +59,9 @@ namespace { std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { switch (allocation_kind) { - case sycl::usm::alloc::host: - return Kokkos::Experimental::SYCLHostUSMSpace::name(); - case sycl::usm::alloc::device: - return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); - case sycl::usm::alloc::shared: - return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); default: Kokkos::abort("bug: unknown sycl allocation type"); return "unreachable"; @@ -75,7 +71,6 @@ std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { } // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -114,12 +109,12 @@ void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { @@ -244,7 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, 
Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental } // namespace Kokkos //============================================================================== @@ -253,11 +247,11 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, #include KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); + Kokkos::SYCLDeviceUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLSharedUSMSpace); + Kokkos::SYCLSharedUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index b86cfca413c0..5a37da130caf 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -154,45 +152,40 @@ class SYCLHostUSMSpace { sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + 
Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -200,26 +193,24 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space == - // Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -227,14 +218,11 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // 
Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -243,16 +231,15 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -264,47 +251,38 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - 
Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace> { +struct MemorySpaceAccess> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool { deepcopy = false }; @@ -315,11 +293,9 @@ struct MemorySpaceAccess< } // namespace Kokkos KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLSharedUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); #endif #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 1e42faa5a833..6359e4a2d9e7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -34,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -126,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto 
sub_group_range = sg.get_local_range()[0]; @@ -139,7 +153,7 @@ class SYCLTeamMember { if (vector_range * shift < sub_group_range) { const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } }; shuffle_combine(1); @@ -153,14 +167,13 @@ class SYCLTeamMember { shift <<= 1) { auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } #endif value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { - reducer.reference() = value; return; } @@ -187,16 +200,15 @@ class SYCLTeamMember { for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && group_id < std::min(start + step_width, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } // Do the final reduction for all threads redundantly value = reduction_array[0]; for (int i = 1; i < std::min(step_width, n_subgroups); ++i) - reducer.join(value, reduction_array[i]); + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = value; // Make sure that every thread is done using the reduction array. sycl::group_barrier(m_item.get_group()); } @@ -271,8 +283,8 @@ class SYCLTeamMember { const auto update = Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? 
update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -311,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -319,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -336,8 +361,7 @@ class SYCLTeamMember { tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( sg, tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; } //---------------------------------------- @@ -531,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + 
Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -541,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. @@ -557,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. 
@@ -657,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -670,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -679,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = 
loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -692,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -746,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. 
@@ -774,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058bdd..556ca0d28186 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include template -class Kokkos::Impl::TeamPolicyInternal +class Kokkos::Impl::TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal TeamPolicyInternal(TeamPolicyInternal const& p) { diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84ba4..79d9e8a8d482 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ 
b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class UniqueToken public: UniqueToken() : UniqueToken( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken(max_size) {} diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 
61db6b34aac0..2905733a4de9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,12 +23,11 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp index 81d43b31b35b..a1fa9e43e083 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -34,7 +34,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -267,7 +266,7 @@ template std::vector partition_space(const Serial&, std::vector const& weights) { static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -284,7 +283,9 @@ std::vector partition_space(const Serial&, #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 34e115eca9b7..addcaba009fa 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ 
b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -44,11 +44,16 @@ class ParallelFor, public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->exec(); } template @@ -112,10 +117,15 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 80faec9041d5..2ab7b7f80348 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -31,7 +31,7 @@ class ParallelFor, Kokkos::Serial> { const Policy m_policy; template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor, Kokkos::Serial> { } template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,10 +49,15 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef 
KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_policy.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->template exec(); } @@ -79,7 +84,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -88,7 +93,7 @@ class ParallelReduce, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; @@ -108,10 +113,15 @@ class ParallelReduce, auto* internal_instance = m_policy.space().impl_internal_space_instance(); + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -166,7 +176,7 @@ class ParallelScan, const Policy m_policy; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -175,7 +185,7 @@ class ParallelScan, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -194,10 +204,16 @@ class ParallelScan, const size_t thread_local_size = 0; // Never 
shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -235,7 +251,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -244,7 +260,7 @@ class ParallelScanWithTotal, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -262,10 +278,16 @@ class ParallelScanWithTotal, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a523cc86c97b..7a6faf3d9fb5 100644 --- 
a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -223,7 +223,7 @@ class ParallelFor, const size_t m_shared; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,10 +247,16 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -293,7 +299,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -301,7 +307,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -321,10 +327,16 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git 
a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index 5905d6d32e14..678d18250474 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization> { template class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t::value>> { + Scheduler, std::enable_if_t>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -215,6 +220,10 @@ extern template class TaskQueue, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 6ad6aabc5a7c..527e09407989 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -31,15 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. 
struct DummyExecutionSpace; -template +template <> struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - View> { - ZeroMemset(const Serial&, const View& dst) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } + std::conditional_t, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 3842966cd77b..edc9489f67e7 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -67,8 +67,9 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -void (*volatile s_current_function)(ThreadsInternal &, const void *); -const void *volatile s_current_function_arg = nullptr; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic s_current_function; +std::atomic s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -79,7 +80,7 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } -void wait_yield(volatile ThreadState &flag, const ThreadState value) { +void wait_yield(std::atomic &flag, const ThreadState value) { while (value == flag) { std::this_thread::yield(); } @@ -135,11 +136,12 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? 
reinterpret_cast(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && @@ -543,7 +545,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a5eb231cb011..130b3433d026 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -60,7 +60,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - ThreadState volatile m_pool_state; ///< State for global synchronizations + std::atomic m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -96,7 +96,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION auto &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -225,7 +225,7 @@ class ThreadsInternal { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. 
// Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. memory_fence(); @@ -403,7 +403,7 @@ class ThreadsInternal { static void start(void (*)(ThreadsInternal &, const void *), const void *); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); + static int in_parallel(); #endif static void fence(); static void fence(const std::string &); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 59577609ab78..711b1b69261f 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -51,7 +51,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -65,7 +65,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 4a89c4fad823..25aab9ebfbc1 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ 
defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class ParallelFor, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -64,7 +64,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -77,7 +77,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index f927d7c6a67e..40be3884c3d4 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor, const size_t m_shared; template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); 
member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -88,8 +88,12 @@ class ParallelFor, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index fa63215a9e5d..9f28f9bbfcc2 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -59,7 +59,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -76,7 +76,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -91,8 +91,8 @@ class ParallelReduce(instance.reduce_memory())); + reference_type update = + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; @@ -100,7 +100,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -73,7 +73,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), @@ -89,7 +89,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 4db310701f9f..69527ee3e65e 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType 
&functor, Member member, reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -106,9 +106,14 @@ class ParallelReduce could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 62f34d741ff4..d54f4ca952e6 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -119,7 +119,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp 
b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 3df9dc07bf43..0f9a77f2afa9 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -108,7 +108,7 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value) { Kokkos::store_fence(); uint32_t i = 0; diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index b98b6dbb73bc..7ab43cdb7af6 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,7 +35,7 @@ enum class WaitMode : int { void host_thread_yield(const uint32_t i, const WaitMode mode); -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value); } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index a3501a437d29..f627e0d47a51 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; f(value); if (m_team_base) { type* const 
local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,7 +186,7 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: using type = - typename if_c::type; + std::conditional_t; if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -215,52 +215,65 @@ class ThreadsExecTeamMember { } template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - team_reduce(const ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c::type; - - type* const local_value = ((type*)m_instance->scratch_memory()); + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t; + + type* const local_value = ((type*)m_instance->scratch_memory()); + + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - // Fence to make sure the base team 
member has access: - memory_fence(); + // Fence to make sure the base team member has access: + memory_fence(); - if (team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + // Team base thread may "lap" member threads so copy out to their + // local value. 
+ for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - // Fence to make sure all team members have access - memory_fence(); - } + // Fence to make sure all team members have access + memory_fence(); + } - team_fan_out(); + team_fan_out(); - // Value was changed by the team base - reducer.reference() = *local_value;)) + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -278,8 +291,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; volatile type* const work_value = ((type*)m_instance->scratch_memory()); @@ -887,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template @@ -907,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -950,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template @@ -962,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - 
reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -1049,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index c88d66db5f9a..5fed92db26de 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -36,13 +36,13 @@ class ParallelFor, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/packages/kokkos/core/src/View/Kokkos_BasicView.hpp b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 000000000000..29eafca62eef --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl::type; + +template +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template +struct is_layout_right_padded : public std::false_type {}; + +template +struct is_layout_right_padded> + : public std::true_type 
{}; + +template +struct is_layout_left_padded : public std::false_type {}; + +template +struct is_layout_left_padded> + : public std::true_type {}; + +template +class BasicView { + public: + using mdspan_type = + mdspan; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. 
in release + // mode) enabled in Kokkos::View 4.4 + template + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v) { + if constexpr (Impl::is_layout_left_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? 
rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template + // requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t, + data_handle_type> + p, + OtherIndexTypes... 
exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast(std::move(exts))...)), + m_acc{} {} + + template + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v>, + data_handle_type> + p, + const Array &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v && + std::is_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template +// requires(std::is_constructible_v::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: 
incompatible extents for View construction"); + } + + template +// requires(std::is_constructible_v>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) {} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template + data_handle_type create_data_handle( + const Impl::ViewCtorProp &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record( + arg_mapping.required_span_size(), + Impl::get_property(prop_copy), + Impl::get_property(prop_copy), + has_exec ? std::optional{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional{std::nullopt}, + std::integral_constant(), + std::integral_constant())); + } + + public: + template + // requires(!Impl::ViewCtorProp::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template + // requires(Impl::ViewCtorProp::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property(arg_prop)), + arg_mapping) {} + + protected: + template + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template , + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. 
It may be present in other + // compilers + template >> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), static_cast(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#endif + + // C++20 operator() + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) 
&& + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v && ...)); + static_assert( + (std::is_nothrow_constructible_v && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#else + template + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v && ...)) && + ((std::is_nothrow_constructible_v && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... 
indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... 
|| false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 
1ade75692f1f..eb11630b21b8 100644 --- a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -26,6 +26,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -41,22 +42,8 @@ bool is_zero_byte(const T& x) { return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template ::value> -struct ViewValueFunctor; - template -struct ViewValueFunctor { +struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; @@ -68,20 +55,31 @@ struct ViewValueFunctor { std::string name; bool default_exec_space; - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { + template + KOKKOS_FUNCTION + std::enable_if_t> + operator()(ConstructTag, const size_t i) const { new (ptr + i) ValueType(); } - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { + KOKKOS_FUNCTION void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else (ptr + i)->~ValueType(); +#endif } - ViewValueFunctor() = default; - ViewValueFunctor(const 
ViewValueFunctor&) = default; + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, @@ -104,49 +102,6 @@ struct ViewValueFunctor { functor_instantiate_workaround(); } - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - template void parallel_for_implementation() { using PolicyType = @@ -172,24 +127,62 @@ struct ViewValueFunctor { const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if 
(Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } + if (default_exec_space || std::is_same_v) { + space.fence(std::is_same_v + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } } - void construct_shared_allocation() { construct_dispatch(); } + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation(); + } void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { #ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - if constexpr (std::is_same_v) - for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); - else + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else #endif - { - parallel_for_implementation(); 
+ parallel_for_implementation(); } } @@ -206,114 +199,6 @@ struct ViewValueFunctor { } }; -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - template struct ViewValueFunctorSequentialHostInit { using ExecSpace = typename DeviceType::execution_space; @@ -358,6 +243,63 @@ struct ViewValueFunctorSequentialHostInit { } }; +template +Kokkos::Impl::SharedAllocationRecord* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional exec_space, + std::bool_constant, std::bool_constant) { + static_assert(SpaceAccessibility::accessible); + + // Use this for constructing and destroying 
the view + using device_type = Kokkos::Device; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + /* Force alignment of allocations on on 8 byte boundaries even for + * element types smaller than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // byte-aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. 
+ record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c79..f77066b70f57 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include #include @@ -44,10 +44,10 @@ class AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 84% rename from packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 379180ae6435..f08047471728 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -72,8 +72,8 @@ struct ViewCtorProp {}; */ template struct ViewCtorProp> { - ViewCtorProp() = default; - ViewCtorProp(const 
ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp; @@ -92,8 +92,8 @@ struct ViewCtorProp || std::is_same_v || std::is_same_v>, P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -106,14 +106,14 @@ struct ViewCtorProp || /* Map input label type to std::string */ template struct ViewCtorProp::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -122,8 +122,8 @@ template struct ViewCtorProp::value || Kokkos::is_execution_space::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -135,8 +135,8 @@ struct ViewCtorProp::value || template struct ViewCtorProp { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -213,14 +213,19 @@ struct ViewCtorProp : public ViewCtorProp... { using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... 
- */ - template - inline ViewCtorProp(Args const &... args) : ViewCtorProp(args)... {} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template , Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp(std::forward(args))... {} template - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp(arg0), ViewCtorProp::type>(args)... {} @@ -252,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -302,7 +307,7 @@ template struct WithPropertiesIfUnset, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -328,7 +333,7 @@ struct WithPropertiesIfUnset, Property, Properties...> { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, - const Properties &... 
properties) { + const Properties &...properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -437,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward(args)...); +} + +template +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp similarity index 96% rename from packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index 04c0c9aeede7..37b6e2802fc9 100644 --- 
a/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -60,8 +60,8 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ }; \ template \ @@ -72,8 +72,8 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ @@ -149,8 +149,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; ViewDimension& operator=(const ViewDimension&) = default; KOKKOS_INLINE_FUNCTION @@ -370,8 +370,7 @@ struct ViewDataAnalysis { // ValueType is opportunity for partial specialization. // Must match array analysis when this default template is used. 
static_assert( - std::is_same::value); + std::is_same_v); public: using specialize = void; // No specialization diff --git a/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 000000000000..fd406d58ccae --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#include +#endif +#include + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. 
The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, double* + * indicates a one-dimensional array of \c double with run-time + * dimension, and int*[3] a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * Space. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. 
+ * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View out, + * View in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View in_rr = in; + * // ... do something with in_rr and out ... 
+ * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template +struct is_always_assignable_impl; + +template +struct is_always_assignable_impl, + Kokkos::View> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View::traits, + typename Kokkos::View::traits, + typename Kokkos::View::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast(Kokkos::View::rank_dynamic) >= + static_cast(Kokkos::View::rank_dynamic); +}; + +template +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t, + std::remove_const_t>>; + +template +inline constexpr bool is_always_assignable_v = + is_always_assignable::value; + +template +constexpr bool is_assignable(const Kokkos::View& dst, + const Kokkos::View& src) { + using DstTraits = typename Kokkos::View::traits; + using SrcTraits = typename Kokkos::View::traits; + using mapping_type = + Kokkos::Impl::ViewMapping; + + return is_always_assignable_v, + Kokkos::View> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + 
+#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +class View; + +template +struct is_view : public std::false_type {}; + +template +struct is_view> : public std::true_type {}; + +template +struct is_view> : public std::true_type {}; + +template +inline constexpr bool is_view_v = is_view::value; + +template +class View : public ViewTraits { + private: + template + friend class View; + template + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker; + + public: + using traits = ViewTraits; + + private: + using map_type = + Kokkos::Impl::ViewMapping; + template + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View; + + /** \brief Compatible view of const data type */ + using const_type = + View; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + View; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType::type; + using uniform_const_type = + typename Impl::ViewUniformType::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename 
Impl::ViewUniformType::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> + extent_int(const iType& r) const noexcept { + return static_cast(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. 
+ */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. 
+ + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v; + + static constexpr bool is_layout_right = + std::is_same_v; + + static constexpr bool is_layout_stride = + std::is_same_v; + + static constexpr bool is_default_map = + std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); + } + + template + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return 
m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which + // have "inlined" versions above + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
+ return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
+ return m_map.reference(i0, i1); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
+ return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. 
+ + template + KOKKOS_INLINE_FUNCTION View( + const View& rhs, + std::enable_if_t::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View& rhs) { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, + Args... 
args) + : m_track(src_view), m_map() { + using SrcType = View; + + using Mapping = Kokkos::Impl::ViewMapping; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + 
size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. 
+ template + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp( + reinterpret_cast(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, 
arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + 
Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { + return View::rank(); +} + +namespace Impl { + +template +struct RankDataType { + using type = typename RankDataType::type*; +}; + +template +struct RankDataType { + using type = ValueType; +}; + +template +KOKKOS_FUNCTION std::enable_if_t< + N == View::rank() && + std::is_same_v::specialize, void>, + View> +as_view_of_rank_n(View v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template +KOKKOS_FUNCTION std::enable_if_t< + N != View::rank() && + std::is_same_v::specialize, void>, + View::value_type, N>::type, + Args...>> +as_view_of_rank_n(View) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template +void apply_to_view_of_static_rank(Function&& f, View a) { + f(a); +} + +} // namespace Impl + +template +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... 
args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} +#endif + +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, + const View& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits; + using rhs_traits = ViewTraits; + + return std::is_same_v && + std::is_same_v && + std::is_same_v && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template +KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, + const View& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +struct CommonViewValueType; + +template +struct CommonViewValueType { + using value_type = std::common_type_t; +}; + +template +struct CommonViewAllocProp; + +template +struct CommonViewAllocProp { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const 
Views&...) {} +}; + +template +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template +struct DeduceCommonViewAllocProp { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view::value }; + + using prop_type = CommonViewAllocProp; +}; + +template +struct DeduceCommonViewAllocProp { + using NextTraits = DeduceCommonViewAllocProp; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v && + !std::is_void_v && + !std::is_void_v), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t, + first_specialize, + std::conditional_t<(std::is_void_v && + !std::is_void_v), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp; +}; + +} // end namespace Impl + +template +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. 
The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. +// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType(views...); +} + +} // namespace Kokkos + +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 90% rename from packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 10aaa63b7c82..ecc19eaf5e25 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -28,61 +28,41 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include +#include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace 
Impl { -// TODO This alias declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. -using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - namespace Kokkos { namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; // Assuming '2 == initializer_list::size()' template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 
1 : 0 }; }; template @@ -93,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type::value }; - static_assert(value || std::is_integral::value || - std::is_void::value, + static_assert(value || std::is_integral_v || std::is_void_v, "subview argument must be either integral or integral extent"); }; @@ -112,16 +91,16 @@ struct SubviewLegalArgsCompileTime { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type::value)) || - ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type::value))) && - (SubviewLegalArgsCompileTime::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v)) || + ((CurrentArg < RankDest) && (std::is_same_v)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type::value))) && + (SubviewLegalArgsCompileTime::value) }; }; @@ -129,7 +108,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v)) && (CurrentArg == RankSrc - 1) }; }; @@ -144,10 +123,9 @@ struct SubviewLegalArgsCompileTime::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same_v))) && (SubviewLegalArgsCompileTime::value) @@ -158,8 +136,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v)) }; }; @@ -392,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, 
args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -718,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -885,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1071,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1086,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding::stride(arg_layout.dimension[0])) { + } template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1407,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1565,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? 
m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1614,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1624,19 +1612,21 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; @@ -1749,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. 
*/ + + private: + template + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding::stride(value); + } + + public: template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant const&, @@ -1764,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? 
size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride(arg_layout)) {} template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1886,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1901,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1916,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1931,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1946,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1961,8 +1939,8 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - 
ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1976,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1991,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2005,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +2261,8 @@ struct ViewOffset { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2398,9 +2376,9 @@ struct ViewDataHandle { template struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && + std::enable_if_t<(std::is_same_v && + std::is_void_v && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle; @@ -2422,11 +2400,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - 
(!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2446,11 +2423,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2485,11 +2461,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2533,11 +2508,10 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void::value && - ViewOffset::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v && + ViewOffset::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset; @@ -2680,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } 
template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -2780,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -2894,29 +2866,34 @@ template class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void::value && - std::is_void::value && + std::is_void_v && + std::is_void_v && ( // same layout - std::is_same::value || + std::is_same_v || // known layout - 
((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value))))>> { + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -2926,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -2939,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same::value || - std::is_same::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v || + std::is_same_v || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3032,22 +3009,21 @@ class ViewMapping< template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - std::is_void::value && - ( - // same layout - std::is_same::value || - // known layout - (std::is_same::value || - std::is_same::value || - std::is_same::value)))>> { + std::enable_if_t<(std::is_same_v && + std::is_void_v && + std::is_void_v && + ( + // same layout + std::is_same_v || + // known layout + (std::is_same_v || + std::is_same_v || + std::is_same_v)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3057,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -3091,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same::value) { + if (std::is_same_v) { for 
(int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3100,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same::value) { + } else if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3197,8 +3172,8 @@ struct SubViewDataTypeImpl> { template struct SubViewDataTypeImpl< - std::enable_if_t>::value>, - ValueType, Kokkos::Experimental::Extents, Integral, Args...> + std::enable_if_t>>, ValueType, + Kokkos::Experimental::Extents, Integral, Args...> : SubViewDataTypeImpl, Args...> {}; @@ -3230,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl {}; template class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, + std::enable_if_t<( + std::is_void_v && + (std::is_same_v || + std::is_same_v || + std::is_same_v))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3292,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same::value) // replace with input rank + std::is_same_v) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. 
(rank <= 2 && R0_rev && - std::is_same::value) // replace input rank + std::is_same_v) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 000000000000..5eddfc68e070 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include +#include +#include +#include +#include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. 
+using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template +struct ViewArrayAnalysis; + +template ::non_const_value_type> +struct ViewDataAnalysis; + +template +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. 
+template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. 
The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout::type; + using accessor_type = + SpaceAwareAccessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template +struct ViewTraits; + +template <> +struct ViewTraits { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = HooksPolicy; +}; + +template +struct ViewTraits::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = ArrayLayout; + using 
memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::HostMirrorSpace, + void> && + std::is_same_v::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::array_layout, + void> && + std::is_same_v::memory_traits, + void> && + std::is_same_v::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits; + + using ExecutionSpace = + std::conditional_t, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t, + typename prop::memory_space, + typename 
ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror::Space>; + + using MemoryTraits = + std::conditional_t, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; 
+ + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. 
+template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d88..1e476132858c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template struct ViewScalarToDataType { using type = typename ViewScalarToDataType::type *; + using const_type = + typename ViewScalarToDataType::const_type *; }; template struct ViewScalarToDataType { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template @@ -49,12 +52,13 @@ struct ViewUniformLayout { template struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t; + using const_data_type = typename ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType::const_type; using array_layout = typename ViewUniformLayout { } KOKKOS_FUNCTION - constexpr typename offset_policy::data_handle_type 
offset(data_handle_type p, - size_t i) const - noexcept { + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { return nested_acc.offset(p, i); } @@ -214,6 +212,199 @@ struct AtomicAccessorRelaxed { } }; +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + // converting ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { 
return m_handle; } + KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template + friend class ReferenceCountedDataHandle; + + template + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template +class ReferenceCountedAccessor; + +template +struct IsReferenceCountedAccessor : std::false_type {}; + +template +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor> + : std::true_type {}; + +template +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + std::is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)&&std:: + is_constructible_v>> + 
KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template >> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor&) {} + + template ::value && + std::is_convertible_v>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor>>; + +template +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor>; + +template +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor>>; + } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp index 089628137d75..f990d158bfad 100644 --- a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp +++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -23,7 +23,11 @@ static_assert(false, #define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP #include "Kokkos_MDSpan_Extents.hpp" -#include +#include + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. namespace Kokkos::Impl { @@ -77,32 +81,7 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 7 ? 
mapping.stride(7) : 0, }; } else { - // FIXME: Kokkos Layouts don't store stride (it's in the mapping) - // We could conceivably fix this by adding an extra ViewCtorProp for - // an abritrary padding. For now we will check for this. - if constexpr (rank > 1 && - (std::is_same_v> || - std::is_same_v>)) { - [[maybe_unused]] constexpr size_t strided_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 1 - : rank - 2; - [[maybe_unused]] constexpr size_t extent_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 0 - : rank - 1; - KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); - } - - return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -110,12 +89,98 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 7 ? 
ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; } #ifdef KOKKOS_COMPILER_INTEL __builtin_unreachable(); #endif } +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v || + std::is_same_v) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence) { + static_assert( + std::is_same_v); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast( + dextents{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template 
+KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl( + layout, std::make_index_sequence()); +} + template KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { using mapping_type = typename MDSpanType::mapping_type; diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211fe..79c137bfddd4 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index d13c90825c5a..3570ed2b6e14 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -35,6 +35,16 @@ #include #include #include + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = ::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f86591..399b986041e9 100644 --- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory 
space accessible from both the SYCL @@ -27,7 +26,6 @@ class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // namespace Experimental } // namespace Kokkos #endif #endif diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b72..a9db2c4cf4a3 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - 
std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { + std::enable_if_t>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> apply(Args&&... 
args) const { m_func(args...); } template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> apply(Args&&... args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && - !std::is_array::value>> { + std::enable_if_t && + !std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile struct HostIterateTile::value && - std::is_array::value>> { + std::enable_if_t && + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3bd..e6b2fcbef4bc 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template , Args&&... args) { - _tag_invoke(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke(f, vals[Idxs]..., (Args&&)args...); } template @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... 
args) { _tag_invoke_array_helper(f, vals, std::make_index_sequence{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c7537f..b483653021a5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait{}; static_assert( - /* always false: */ std::is_void::value, + /* always false: */ std::is_void_v, "Unknown execution policy trait. Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b20563..4ea0b8d343b5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer&&) = default; + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer&&) = default; - ~non_owning_variable_size_circular_buffer() = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const 
noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template ::value>> + std::is_default_constructible_v>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd685..ee53fd8bc6d4 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. 
return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c63391d..d7319e80c871 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl, std::move(arg_values))... {} template - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl::ref(); } template @@ -181,7 +181,7 @@ struct CombinedReducerImpl, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl, Space, template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... 
args) const { this->template _call_op_impl( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same, value_type>::value> + !std::is_same_v, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t::value_type...>{ // This helper function is now poorly named after refactoring. 
- _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t...>; return reducer_type(value, _reducer_from_arg_t{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c388..9bde2f72a3ff 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? 
type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? 
type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp index 6f862718bcb0..72f33ffaab90 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ -138,7 +138,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - 
return Kokkos::Experimental::Impl::get_sycl_devices().size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -183,7 +183,7 @@ std::vector const& Kokkos::Impl::get_visible_devices() { #elif defined(KOKKOS_ENABLE_OPENMPTARGET) int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - int device = Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else int device = -1; return device; @@ -271,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -284,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -308,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -324,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. 
Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -332,7 +332,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); @@ -613,7 +613,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #endif declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -666,6 +666,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_RISCV_SG2042) declare_configuration_metadata("architecture", "CPU architecture", "SG2042 (RISC-V)") +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -738,8 +741,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -752,6 +755,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_GFX1100) 
declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1100"); +#elif defined(KOKKOS_ARCH_AMD_GFX1103) + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX1103"); #else declare_configuration_metadata("architecture", "GPU architecture", "none"); @@ -973,7 +979,7 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result == Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac98..cd00fdadebaf 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag::type, - public GraphNodeKernelDefaultImpl { + : public GraphNodeKernelDefaultImpl, + public PatternImplSpecializationFromTag::type { public: using base_t = typename PatternImplSpecializationFromTag - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... 
args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) {} // FIXME @graph Forward through the instance once that works in the backends template - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seem unused. + } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab40..31d147ea894b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( 
GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. + // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. + const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. 
+ for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 05d485491932..8dfa19a178cf 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -58,12 +58,12 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -136,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding. 
This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". + exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4ccc4..a8a4d6617bcd 100644 --- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template class CtorNotOnDevice> struct EBOBaseImpl { template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && !CtorNotOnDevice::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && CtorNotOnDevice::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl { T m_ebo_object; template ::value && + std::enable_if_t && !CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) 
{} template ::value && + std::enable_if_t && CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept( noexcept(T(std::forward(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl { template class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl::value, CtorsNotOnDevice> { + : EBOBaseImpl, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22a2..58a5de2aa626 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) 
storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a64..5805b78ee75b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space()); static_assert(check_is_regular()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97cb..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? - - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr m_first_block = nullptr; - Kokkos::OwningRawPtr m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? 
- auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; 
- size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? 
- auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - 
free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e50..29a365e6e418 100644 --- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space; static_assert(!policy_has_space::value || !functor_has_space::value || - std::is_same::value, + std::is_same_v, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 +136,8 @@ struct FunctorAnalysis { typename std::is_void::type> { using type = typename F::value_type; - static_assert(!std::is_reference::value && - std::rank::value <= 1 && - std::extent::value == 0, + static_assert(!std::is_reference_v && std::rank_v <= 1 && + std::extent_v == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template ::type, - bool T = std::is_void::value> + bool T = std::is_void_v> struct deduce_value_type { 
using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type::type; enum { - candidate_is_void = std::is_void::value, - candidate_is_array = std::rank::value == 1 + candidate_is_void = std::is_void_v, + candidate_is_array = std::rank_v == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t; - static_assert(!std::is_const::value, + static_assert(!std::is_const_v, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template - struct DeduceJoinNoTag::value || - (!is_reducer::value && - std::is_void::value)) && - detected_join_no_tag::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + detected_join_no_tag::value>> : public has_join_no_tag_function { enum : bool { value = true }; }; template struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer::value || - (!is_reducer::value && std::is_void::value)) && - (!detected_join_no_tag::value && - detected_volatile_join_no_tag::value)>> + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + (!detected_join_no_tag::value && + detected_volatile_join_no_tag::value)>> : public has_volatile_join_no_tag_function { enum : bool { value = true }; static_assert(Impl::dependent_false_v, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template struct DeduceInitNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_init_no_tag_function::enable_if( &F::init))>> : public has_init_no_tag_function { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template struct DeduceFinalNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && 
std::is_void_v), decltype(has_final_no_tag_function::enable_if( &F::final))>> : public has_final_no_tag_function { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return m_functor.value_count; } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return candidate_is_void ? 0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d88..6d3ebf64befc 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of::value, "Kokkos Internal Error in graph interface"); - return std::make_shared((Args &&) args...); + return std::make_shared((Args&&)args...); } template ::value, "Kokkos Internal Implementation error (bad 
argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // end accessors for private members of public interface }}}2 diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e439..b02a26547223 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template