diff --git a/README.md b/README.md index 6ed651ff..d0362082 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # YASK--Yet Another Stencil Kernel -* New YASK users may want to start with the [YASK tutorial](https://www.ixpug.org/components/com_solutionlibrary/assets/documents/1538169451-IXPUG_Fall_Conf_2018_paper_2%20-%20Rev3%20-%20Charles%20Yount.pdf). +* New YASK users may want to start with the [YASK tutorial](docs/YASK-tutorial.pdf). * Existing YASK users may want to jump to the [backward-compatibility notices](#backward-compatibility-notices). ## Overview @@ -25,12 +25,12 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD- for multi-socket and multi-node operation or Intel(R) Parallel Studio XE Composer Edition for C++ Linux for single-socket only - (2016 or later, 2018 update 2 or later recommended). + (2018 or later; 2019 or later recommended and required when using g++ 8 or later). Building a YASK kernel with the Gnu compiler is possible, but only useful for functional testing. The performance of the kernel built from the Gnu compiler has been observed to be up to 7x lower than the same kernel built using the Intel compiler. -* Gnu C++ compiler, g++ (4.9.0 or later; 6.1.0 or later recommended). +* Gnu C++ compiler, g++ (4.9.0 or later; 8.2.0 or later recommended). * Linux libraries `librt` and `libnuma`. * Perl (5.010 or later). * Awk. @@ -45,7 +45,7 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD- Reading the generated code is only necessary for debug or curiosity. * SWIG (3.0.12 or later), http://www.swig.org, for creating the Python interface. - * Python 2 (2.7.5 or later) or 3 (3.6.1 or later, recommended), + * Python 2 (2.7.5 or later) or 3 (3.6.1 or later), https://www.python.org/downloads, for creating and using the Python interface. * Doxygen (1.8.11 or later), http://doxygen.org, for creating updated API documentation. @@ -58,6 +58,9 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD- for functional testing if you don't have native support for any given instruction set. ### Backward-compatibility notices: +* Version 2.18.00 added the ability to specify the global-domain size, and it will calculate the local-domain sizes from it. +There is no longer a default local-domain size. +Output changed terms "overall-problem" to "global-domain" and "rank-domain" to "local-domain". * Version 2.17.00 determined the host architecture in `make` and `bin/yask.sh` and number of MPI ranks in `bin/yask.sh`. This changed the old behavior of `make` defaulting to `snb` architecture and `bin/yask.sh` requiring `-arch` and `-ranks`. Those options are still available to override the host-based default. diff --git a/docs/YASK-tutorial.pdf b/docs/YASK-tutorial.pdf new file mode 100644 index 00000000..cd1f3835 Binary files /dev/null and b/docs/YASK-tutorial.pdf differ diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp index d95654ee..016d92af 100644 --- a/include/yk_solution_api.hpp +++ b/include/yk_solution_api.hpp @@ -134,24 +134,31 @@ namespace yask { virtual std::vector get_misc_dim_names() const =0; - /// Set the size of the solution domain for this rank. + /// Set the local-domain size in the specified dimension, i.e., the size of the part of the domain that is in this rank. /** The domain defines the number of elements that will be evaluated with the stencil(s). - If MPI is not enabled, this is the entire problem domain. 
- If MPI is enabled, this is the domain for the current rank only, - and the problem domain consists of the sum of all rank domains - in each dimension (weak-scaling). - The domain size in each rank does not have to be the same, but - all domains in the same column must have the same width, - all domains in the same row must have the same height, + If MPI is not enabled, this is equivalent to the global-domain size. + If MPI is enabled, this is the domain size for the current rank only, + and the global-domain size is the sum of all local-domain sizes + in each dimension. + The local-domain size in each rank does not have to be the same, but + all local-domains in the same column of ranks must have the same width, + all local-domains in the same row must have the same height, and so forth, for each domain dimension. - The domain size does *not* include the halo area or any padding. - For best performance, set the rank domain + The local-domain size does *not* include the halo area or any padding. + For best performance, set the local-domain size to a multiple of the number of elements in a vector-cluster in - each dimension whenever possible. + each dimension. + + You should set either the local-domain size or the global-domain size + in each dimension. The unspecified (zero) sizes will be calculated based on the + specified ones when prepare_solution() is called. + Setting the local-domain size to a non-zero value will clear the + global-domain size in that dimension until prepare_solution() is called. + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. There is no domain-size setting allowed in the - solution-step dimension (usually "t"). + solution-step dimension (e.g., "t"). */ virtual void set_rank_domain_size(const std::string& dim @@ -159,8 +166,15 @@ namespace yask { the names from get_domain_dim_names(). */, idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; - /// Get the domain size for this rank. + /// Get the local-domain size in the specified dimension, i.e., the size in this rank. /** + See documentation for set_rank_domain_size(). + + If you have called set_overall_domain_size() in a given dimension, + get_rank_domain_size() will return zero in that dimension until + prepare_solution() is called. After prepare_solution() is called, + the computed size will be returned. + + @returns Current setting of rank domain size in specified dimension. */ virtual idx_t @@ -168,6 +182,44 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */) const =0; + /// Set the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. + /** + You should set either the local-domain size or the global-domain size + in each dimension. The unspecified (zero) sizes will be calculated based on the + specified ones when prepare_solution() is called. + Setting the global-domain size to a non-zero value will clear the + local-domain size in that dimension until prepare_solution() is called. + + See documentation for set_rank_domain_size(). + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. + There is no domain-size setting allowed in the + solution-step dimension (e.g., "t"). + */ + virtual void + set_overall_domain_size(const std::string& dim + /**< [in] Name of dimension to set. Must be one of + the names from get_domain_dim_names(). */, + idx_t size /**< [in] Elements in the domain in this `dim`.
*/ ) =0; + + /// Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. + /** + The global-domain indices in the specified dimension will range from + zero (0) to get_overall_domain_size() - 1, inclusive. + Call get_first_rank_domain_index() and get_last_rank_domain_index() + to find the subset of this domain in each rank. + + If you have called set_rank_domain_size() in a given dimension, + get_overall_domain_size() will return zero in that dimension until + prepare_solution() is called. After prepare_solution() is called, + the computed size will be returned. + + @returns Sum of all ranks' domain sizes in the given dimension. + */ + virtual idx_t + get_overall_domain_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + /// Set the block size in the given dimension. /** This sets the approximate number of elements that are evaluated in @@ -208,8 +260,16 @@ namespace yask { /// Set the number of MPI ranks in the given dimension. /** - The *product* of the number of ranks across all dimensions must - equal yk_env::get_num_ranks(). + If set_num_ranks() is set to a non-zero value in all + dimensions, then + the *product* of the number of ranks across all dimensions must + equal the value returned by yk_env::get_num_ranks(). + If the number of ranks is zero in one or more + dimensions, those values will be set by a heuristic when + prepare_solution() is called. + An exception will be thrown if no legal values are possible + given the specified (non-zero) values. + The curent MPI rank will be assigned a unique location within the overall problem domain based on its MPI rank index. Or, you can set it explicitly via set_rank_index(). @@ -356,22 +416,6 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */ ) const =0; - /// Get the overall problem size in the specified dimension. - /** - The overall domain indices in the specified dimension will range from - zero (0) to get_overall_domain_size() - 1, inclusive. - Call get_first_rank_domain_index() and get_last_rank_domain_index() - to find the subset of this domain in each rank. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() obtains the sub-domain sizes from other ranks. - @returns Sum of all ranks' domain sizes in the given dimension. - */ - virtual idx_t - get_overall_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - /// Run the stencil solution for the specified steps. /** The stencil(s) in the solution are applied to the grid data, setting the diff --git a/src/common/common.mk b/src/common/common.mk index 881b9b4a..7a7d18d3 100644 --- a/src/common/common.mk +++ b/src/common/common.mk @@ -70,6 +70,9 @@ PERL := perl MKDIR := mkdir -p -v BASH := bash +# Options to avoid warnings when compiling SWIG-generated code. +SWIG_CXXFLAGS := -Wno-class-memaccess -Wno-stringop-overflow -Wno-stringop-truncation + # Find include path needed for python interface. # NB: constructing string inside print() to work for python 2 or 3. 
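For reference, a minimal sketch of how the new size APIs fit together from user code (the `yk_factory`/`new_env`/`new_solution` boilerplate and the 1024-point size are illustrative assumptions, not part of this change):

```cpp
#include <iostream>
#include "yask_kernel_api.hpp"
using namespace yask;

int main() {
    yk_factory kfac;
    auto env = kfac.new_env();           // Inits MPI when the kernel is built with it.
    auto soln = kfac.new_solution(env);

    // Set only the global-domain size; leave the local-domain sizes and the
    // rank layout at zero so they are derived in prepare_solution().
    for (auto dname : soln->get_domain_dim_names())
        soln->set_overall_domain_size(dname, 1024);

    soln->prepare_solution();

    // After prepare_solution(), the derived per-rank sizes are available.
    for (auto dname : soln->get_domain_dim_names())
        std::cout << dname << ": local=" << soln->get_rank_domain_size(dname)
                  << ", global=" << soln->get_overall_domain_size(dname) << "\n";
    return 0;
}
```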
PYINC := $(addprefix -I,$(shell $(PYTHON) -c 'import distutils.sysconfig; print(distutils.sysconfig.get_python_inc() + " " + distutils.sysconfig.get_python_inc(plat_specific=1))')) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 20ea4306..e27109f5 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -46,7 +46,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.17.00"; + const string version = "2.18.00"; string yask_get_version_string() { return version; diff --git a/src/common/tuple.cpp b/src/common/tuple.cpp index bbaf8f6a..43244ab2 100644 --- a/src/common/tuple.cpp +++ b/src/common/tuple.cpp @@ -239,7 +239,7 @@ namespace yask { // For some reason, copying *this and erasing // the element in newt._q causes an exception. Tuple newt; - for (int i = 0; i < size(); i++) { + for (int i = 0; i < getNumDims(); i++) { if (i != posn) newt.addDimBack(getDimName(i), getVal(i)); } diff --git a/src/common/tuple.hpp b/src/common/tuple.hpp index 869975fc..a3fb6865 100644 --- a/src/common/tuple.hpp +++ b/src/common/tuple.hpp @@ -162,15 +162,15 @@ namespace yask { public: Tuple() {} - ~Tuple() {} + ~Tuple() {} // NOT a virtual class! // first-inner (first dim is unit stride) accessors. bool isFirstInner() const { return _firstInner; } void setFirstInner(bool fi) { _firstInner = fi; } // Query number of dims. - int size() const { - return int(_q.size()); + size_t size() const { + return _q.size(); } int getNumDims() const { return int(_q.size()); @@ -328,7 +328,7 @@ namespace yask { // extra values are ignored. If there are fewer values in 'vals' // than 'this', only the number of values supplied will be updated. void setVals(int numVals, const T vals[]) { - int end = int(std::min(numVals, size())); + int end = std::min(numVals, int(_q.size())); for (int i = 0; i < end; i++) setVal(i, vals[i]); } @@ -553,6 +553,9 @@ namespace yask { Tuple negElements() const { return mapElements([&](T in){ return -in; }); } + Tuple absElements() const { + return mapElements([&](T in){ return abs(in); }); + } // make string like "4x3x2" or "4, 3, 2". std::string makeValStr(std::string separator=", ", diff --git a/src/compiler/Makefile b/src/compiler/Makefile index 376fbd77..99b1051c 100644 --- a/src/compiler/Makefile +++ b/src/compiler/Makefile @@ -74,17 +74,17 @@ YC_LFLAGS := -lrt -Wl,-rpath=$(LIB_OUT_DIR) -L$(LIB_OUT_DIR) -l$(YC_BASE) $(YC_OBJ_DIR)/%.o: $(YC_STENCIL_DIR)/%.cpp $(YC_INC_GLOB) $(YC_STENCIL_INC_GLOB) $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -DUSE_INTERNAL_DSL -O0 -c -o $@ $< + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -DUSE_INTERNAL_DSL -O0 -c -o $@ $< @ls -l $@ $(YC_OBJ_DIR)/%.o: $(YC_LIB_SRC_DIR)/%.cpp $(YC_INC_GLOB) $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -fPIC -c -o $@ $< + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< @ls -l $@ $(YC_OBJ_DIR)/%.o: $(COMM_DIR)/%.cpp $(YC_INC_GLOB) $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -fPIC -c -o $@ $< + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< @ls -l $@ ######## Primary targets. 
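A hypothetical usage sketch of the new `Tuple::absElements()` helper added above (assuming the kernel's usual `IdxTuple` alias for `Tuple<idx_t>`; the dim names and output format are illustrative):

```cpp
IdxTuple deltas;                 // e.g., index offsets between two ranks.
deltas.addDimBack("x", -1);
deltas.addDimBack("y", 2);
deltas.addDimBack("z", 0);

// Element-wise absolute value via the new mapElements()-based helper.
IdxTuple dist = deltas.absElements();

std::cout << dist.makeValStr() << std::endl;   // something like "1, 2, 0".
```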
@@ -127,7 +127,7 @@ $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp: $(YC_SWIG_DIR)/yask*.i $(INC_DIR) # https://github.com/swig/swig/issues/773 $(YC_OBJ_DIR)/yask_compiler_api_wrap.o: $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -DNDEBUG $(PYINC) -fPIC -c -o $@ $< + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ $(SWIG_CXXFLAGS) -DNDEBUG $(PYINC) -fPIC -c -o $@ $< @ls -l $@ $(YC_PY_LIB): $(YC_OBJS) $(YC_OBJ_DIR)/yask_compiler_api_wrap.o diff --git a/src/compiler/lib/Grid.cpp b/src/compiler/lib/Grid.cpp index 7259289d..f4882fa1 100644 --- a/src/compiler/lib/Grid.cpp +++ b/src/compiler/lib/Grid.cpp @@ -200,13 +200,10 @@ namespace yask { // Can fold if ALL fold dims >1 are used in this grid. -#if 1 - // NB: this will always be true if there is no vectorization. - // We do this because the compiler expects stencils to be vectorizable. - _isFoldable = _numFoldableDims == dims._foldGT1.size(); -#else - _isFoldable = (_numFoldableDims > 0 ) && (_numFoldableDims == dims._foldGT1.size()); -#endif + // NB: this will always be true if there is no vectorization, i.e., + // both are zero. We do this because the compiler expects stencils + // to be vectorizable. + _isFoldable = _numFoldableDims == int(dims._foldGT1.size()); } // Determine whether halo sizes are equal. diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 0d2c9ca7..c4422cb5 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -63,7 +63,6 @@ else ifeq ($(stencil),cube) else ifneq ($(findstring iso3dfd,$(stencil)),) MACROS += MAX_EXCH_DIST=1 radius := 8 - def_rank_args := -d 1024 def_pad_args := -ep 1 ifeq ($(arch),knl) fold_4byte := x=2,y=8 @@ -92,7 +91,6 @@ else ifneq ($(findstring iso3dfd,$(stencil)),) else ifneq ($(findstring awp,$(stencil)),) def_block_args := -b 32 YC_FLAGS += -min-es 1 - def_rank_args := -d 1024 -dz 128 def_pad_args := -ep 1 ifeq ($(arch),knl) fold_4byte := x=4,y=4 @@ -117,16 +115,13 @@ else ifneq ($(findstring awp,$(stencil)),) endif else ifneq ($(findstring ssg,$(stencil)),) - def_rank_args := -d 512 ifneq ($(filter $(arch),skx skl clx),) - def_rank_args := -d 640 -dx 320 fold_4byte := x=4,y=4 def_block_args := -bx 96 -by 16 -bz 80 def_block_threads := 2 endif else ifneq ($(findstring fsg,$(stencil)),) - def_rank_args := -d 256 ifeq ($(arch),knl) omp_region_schedule := guided def_block_args := -b 16 @@ -143,7 +138,6 @@ else ifneq ($(findstring fsg,$(stencil)),) else ifeq ($(stencil),tti) MACROS += MAX_EXCH_DIST=3 radius := 2 - def_rank_args := -d 512 ifneq ($(filter $(arch),skx skl clx),) fold_4byte := x=4,y=4 def_block_args := -bx 80 -by 16 -bz 40 @@ -231,7 +225,6 @@ omp_block_schedule ?= static,1 omp_misc_schedule ?= guided def_thread_divisor ?= 1 def_block_threads ?= 2 -def_rank_args ?= -d 128 def_block_args ?= -b 64 cluster ?= x=1 pfd_l1 ?= 0 @@ -433,7 +426,7 @@ MACROS += ALLOW_NEW_GRIDS=$(allow_new_grid_types) # Default cmd-line args. DEF_ARGS += -thread_divisor $(def_thread_divisor) DEF_ARGS += -block_threads $(def_block_threads) -DEF_ARGS += $(def_rank_args) $(def_block_args) $(def_pad_args) $(more_def_args) +DEF_ARGS += $(def_block_args) $(def_pad_args) $(more_def_args) YK_CXXFLAGS += -DDEF_ARGS='"$(DEF_ARGS) $(EXTRA_DEF_ARGS)"' # arch. 
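To make the foldability test in `Grid.cpp` above concrete, a small worked example (the fold settings and grid dims are illustrative):

```cpp
// Suppose the vector fold is x=4, y=4, so dims._foldGT1 == {x, y} (size 2).
//  - A grid over (t, x, y, z) uses both fold dims:  _numFoldableDims == 2 == 2  -> foldable.
//  - A grid over (t, x) uses only one fold dim:     _numFoldableDims == 1 != 2  -> not foldable.
//  - With no vectorization, _foldGT1 is empty:      0 == 0                      -> foldable,
//    which is the "both are zero" case mentioned in the comment.
```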
@@ -710,7 +703,7 @@ $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp: $(YK_SWIG_DIR)/yask*.i $(INC_DIR)/* $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.o: $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) $(PYINC) -fPIC -c -o $@ $< + $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -x c++ $(SWIG_CXXFLAGS) $(PYINC) -fPIC -c -o $@ $< @ls -l $@ $(YK_PY_LIB): $(YK_OBJS) $(YK_EXT_OBJS) $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.o @@ -1016,4 +1009,4 @@ help: echo "Example builds with test runs:"; \ echo " $(MAKE) -j all # Normal full API and stencil tests"; \ echo " $(MAKE) -j all YK_CXXOPT=-O2 YK_CXX=g++ mpi=0 ranks=1 # g++ w/o MPI"; \ - echo " $(MAKE) -j all YK_CXXOPT=-O1 ranks=3 check=1 # Run 3 ranks w/checking" + echo " $(MAKE) -j all YK_CXXOPT=-O1 ranks=4 check=1 # Run 4 ranks w/checking" diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index 95256822..ab0a22b9 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -231,7 +231,6 @@ namespace yask { // Some calculated sizes for this rank and overall. IdxTuple rank_domain_offsets; // Domain index offsets for this rank. - IdxTuple overall_domain_sizes; // Total of rank domains over all ranks. idx_t rank_nbytes=0, tot_nbytes=0; idx_t rank_domain_pts=0, tot_domain_pts=0; @@ -565,7 +564,6 @@ namespace yask { virtual idx_t get_first_rank_domain_index(const std::string& dim) const; virtual idx_t get_last_rank_domain_index(const std::string& dim) const; - virtual idx_t get_overall_domain_size(const std::string& dim) const; virtual void run_solution(idx_t first_step_index, idx_t last_step_index); @@ -575,12 +573,14 @@ namespace yask { virtual void share_grid_storage(yk_solution_ptr source); // APIs that access settings. + virtual void set_overall_domain_size(const std::string& dim, idx_t size); virtual void set_rank_domain_size(const std::string& dim, idx_t size); virtual void set_min_pad_size(const std::string& dim, idx_t size); virtual void set_block_size(const std::string& dim, idx_t size); virtual void set_region_size(const std::string& dim, idx_t size); virtual void set_num_ranks(const std::string& dim, idx_t size); virtual void set_rank_index(const std::string& dim, idx_t size); + virtual idx_t get_overall_domain_size(const std::string& dim) const; virtual idx_t get_rank_domain_size(const std::string& dim) const; virtual idx_t get_min_pad_size(const std::string& dim) const; virtual idx_t get_block_size(const std::string& dim) const; diff --git a/src/kernel/lib/settings.cpp b/src/kernel/lib/settings.cpp index 68eb7997..728e9626 100644 --- a/src/kernel/lib/settings.cpp +++ b/src/kernel/lib/settings.cpp @@ -240,31 +240,33 @@ namespace yask { auto& step_dim = dims->_step_dim; // Use both step and domain dims for all size tuples. + _global_sizes = dims->_stencil_dims; + _global_sizes.setValsSame(0); // 0 => calc from rank. + _rank_sizes = dims->_stencil_dims; - _rank_sizes.setValsSame(def_rank); // size of rank. - _rank_sizes.setVal(step_dim, 0); // not used. + _rank_sizes.setValsSame(0); // 0 => calc from global. _region_sizes = dims->_stencil_dims; - _region_sizes.setValsSame(0); // 0 => default settings. + _region_sizes.setValsSame(0); // 0 => rank size. _block_group_sizes = dims->_stencil_dims; _block_group_sizes.setValsSame(0); // 0 => min size. _block_sizes = dims->_stencil_dims; - _block_sizes.setValsSame(def_block); // size of block. + _block_sizes.setValsSame(def_block); // size of block. TODO: calculate good value. _block_sizes.setVal(step_dim, 0); // 0 => default. 
_mini_block_group_sizes = dims->_stencil_dims; _mini_block_group_sizes.setValsSame(0); // 0 => min size. _mini_block_sizes = dims->_stencil_dims; - _mini_block_sizes.setValsSame(0); // 0 => default settings. + _mini_block_sizes.setValsSame(0); // 0 => calc from block. _sub_block_group_sizes = dims->_stencil_dims; _sub_block_group_sizes.setValsSame(0); // 0 => min size. _sub_block_sizes = dims->_stencil_dims; - _sub_block_sizes.setValsSame(0); // 0 => default settings. + _sub_block_sizes.setValsSame(0); // 0 => calc from mini-block. _min_pad_sizes = dims->_stencil_dims; _min_pad_sizes.setValsSame(0); @@ -274,7 +276,7 @@ namespace yask { // Use only domain dims for MPI tuples. _num_ranks = dims->_domain_dims; - _num_ranks.setValsSame(1); + _num_ranks.setValsSame(0); // 0 => set using heuristic. _rank_indices = dims->_domain_dims; _rank_indices.setValsSame(0); @@ -310,16 +312,21 @@ namespace yask { } // Option for setting all domain dims. + auto shortcut = prefix; + if (shortcut.back() == '_') + shortcut.pop_back(); parser.add_option(new CommandLineParser::MultiIdxOption - (prefix, - "Shorthand for" + multi_help, + (shortcut, + "Shortcut for" + multi_help, multi_vars)); } // Add these settigns to a cmd-line parser. void KernelSettings::add_options(CommandLineParser& parser) { - _add_domain_option(parser, "d", "Rank-domain size", _rank_sizes); + _add_domain_option(parser, "g", "Global-domain (overall-problem) size", _global_sizes); + _add_domain_option(parser, "l", "Local-domain (rank) size", _rank_sizes); + _add_domain_option(parser, "d", "Alias for local-domain size (deprecated)", _rank_sizes); _add_domain_option(parser, "r", "Region size", _region_sizes, true); _add_domain_option(parser, "b", "Block size", _block_sizes, true); _add_domain_option(parser, "mb", "Mini-block size", _mini_block_sizes); @@ -455,14 +462,14 @@ namespace yask { " then this is the unit of work for each wave-front rank tile;\n" " else, there is typically only one region the size of the rank-domain.\n" " Regions are evaluated sequentially within ranks.\n" - " A 'rank-domain' is composed of regions.\n" + " A 'local-domain' or 'rank-domain' is composed of regions.\n" " This is the unit of work for one MPI rank.\n" " Ranks are evaluated in parallel in separate MPI processes.\n" - " The 'overall-problem' is composed of rank-domains.\n" + " The 'global-domain' or 'overall-problem' is composed of local-domains.\n" " This is the unit of work across all MPI ranks.\n" << #ifndef USE_MPI " This binary has NOT been compiled with MPI support,\n" - " so the overall-problem is equivalent to the single rank-domain.\n" << + " so the global-domain is equivalent to the single local-domain.\n" << #endif "\nGuidelines for setting tiling sizes:\n" " The vector and vector-cluster sizes are set at compile-time, so\n" @@ -501,9 +508,13 @@ namespace yask { " The region size in the step dimension affects how often MPI halo-exchanges occur:\n" " A region size of 0 in the step dimension => exchange after every pack.\n" " A region size >0 in the step dimension => exchange after that many steps.\n" - " Set rank-domain sizes to specify the work done on this rank.\n" - " Set the domain sizes to specify the problem size for this rank.\n" + " Set local-domain sizes to specify the work done on this MPI rank.\n" + " A local-domain size of 0 in a given domain dimension =>\n" + " local-domain size is determined by the global-domain size in that dimension.\n" " This and the number of grids affect the amount of memory used.\n" + " Set global-domain sizes to 
specify the work done across all MPI ranks.\n" + " A global-domain size of 0 in a given domain dimension =>\n" + " global-domain size is the sum of local-domain sizes in that dimension.\n" #ifdef SHOW_GROUPS " Setting 'group' sizes controls only the order of tiles.\n" " These are advanced settings that are not commonly used.\n" @@ -521,17 +532,26 @@ namespace yask { " Num threads used for halo exchange is same as num per region.\n" << #ifdef USE_MPI "\nControlling MPI scaling:\n" - " To 'weak-scale' to a larger overall-problem size, use multiple MPI ranks\n" - " and keep the rank-domain sizes constant.\n" " To 'strong-scale' a given overall-problem size, use multiple MPI ranks\n" - " and reduce the size of each rank-domain appropriately.\n" << + " and keep the global-domain sizes constant.\n" + " To 'weak-scale' to a larger overall-problem size, use multiple MPI ranks\n" + " and keep the local-domain sizes constant.\n" << #endif - appNotes << - "Examples for a 3D (x, y, z) over time (t) problem:\n" - " " << pgmName << " -d 768\n" - " " << pgmName << " -dx 512 -dy 256 -dz 128\n" - " " << pgmName << " -d 2048 -r 512 -rt 10 # temporal rank tiling.\n" - " " << pgmName << " -d 512 -nrx 2 -nry 1 -nrz 2 # multi-rank.\n"; + appNotes; + + // Make example knobs. + string ex1, ex2; + DOMAIN_VAR_LOOP(i, j) { + auto& dname = _dims->_domain_dims.getDimName(j); + ex1 += " -g" + dname + " " + to_string(i * 128); + ex2 += " -nr" + dname + " " + to_string(i + 1); + } + os << + "\nExamples:\n" + " " << pgmName << " -g 768 # global-domain size in all dims.\n" + " " << pgmName << ex1 << " # global-domain size in each dim.\n" + " " << pgmName << " -l 2048 -r 512 -rt 10 # local-domain size and temporal rank tiling.\n" + " " << pgmName << " -g 512" << ex2 << " # number of ranks in each dim.\n"; for (auto ae : appExamples) os << " " << pgmName << " " << ae << endl; os << flush; @@ -610,9 +630,9 @@ namespace yask { // Default region size (if 0) will be size of rank-domain. 
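Restating the scaling guidance above in terms of the solution API (the `soln` object and the sizes are illustrative):

```cpp
// Strong scaling: fix the overall problem; adding ranks shrinks each local domain.
soln->set_overall_domain_size("x", 2048);  // same value regardless of rank count.

// Weak scaling: fix the per-rank work; adding ranks grows the overall problem.
soln->set_rank_domain_size("x", 512);      // global x-size becomes the sum of the per-rank sizes.
```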
os << "\nRegions:" << endl; auto nr = findNumSubsets(os, _region_sizes, "region", - _rank_sizes, "rank-domain", + _rank_sizes, "local-domain", cluster_pts, step_dim); - os << " num-regions-per-rank-domain-per-step: " << nr << endl; + os << " num-regions-per-local-domain-per-step: " << nr << endl; os << " Since the region size in the '" << step_dim << "' dim is " << rt << ", temporal wave-front rank tiling is "; if (!rt) os << "NOT "; @@ -626,7 +646,7 @@ namespace yask { _region_sizes, "region", cluster_pts, step_dim); os << " num-blocks-per-region-per-step: " << nb << endl; - os << " num-blocks-per-rank-domain-per-step: " << (nb * nr) << endl; + os << " num-blocks-per-local-domain-per-step: " << (nb * nr) << endl; os << " Since the block size in the '" << step_dim << "' dim is " << bt << ", temporal blocking is "; if (!bt) os << "NOT "; @@ -640,7 +660,7 @@ namespace yask { cluster_pts, step_dim); os << " num-mini-blocks-per-block-per-step: " << nmb << endl; os << " num-mini-blocks-per-region-per-step: " << (nmb * nb) << endl; - os << " num-mini-blocks-per-rank-domain-per-step: " << (nmb * nb * nr) << endl; + os << " num-mini-blocks-per-local-domain-per-step: " << (nmb * nb * nr) << endl; os << " Since the mini-block size in the '" << step_dim << "' dim is " << mbt << ", temporal wave-front block tiling is "; if (!mbt) os << "NOT "; diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index 71a2b3b2..e916afbb 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -449,8 +449,7 @@ namespace yask { protected: // Default sizes. - idx_t def_rank = 128; - idx_t def_block = 32; + idx_t def_block = 32; // TODO: calculate this. // Make a null output stream. yask_output_factory yof; @@ -462,6 +461,7 @@ namespace yask { DimsPtr _dims; // Sizes in elements (points). + IdxTuple _global_sizes; // Overall problem domain sizes. IdxTuple _rank_sizes; // This rank's domain sizes. IdxTuple _region_sizes; // region size (used for wave-front tiling). IdxTuple _block_group_sizes; // block-group size (only used for 'grouped' region loops). @@ -470,8 +470,8 @@ namespace yask { IdxTuple _mini_block_sizes; // mini-block size (used for wave-fronts in blocks). IdxTuple _sub_block_group_sizes; // sub-block-group size (only used for 'grouped' mini-block loops). IdxTuple _sub_block_sizes; // sub-block size (used for each nested thread). - IdxTuple _min_pad_sizes; // minimum spatial padding. - IdxTuple _extra_pad_sizes; // extra spatial padding. + IdxTuple _min_pad_sizes; // minimum spatial padding (including halos). + IdxTuple _extra_pad_sizes; // extra spatial padding (outside of halos). // MPI settings. IdxTuple _num_ranks; // number of ranks in each dim. diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index ab69aef5..da7bdf49 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -70,8 +70,7 @@ namespace yask { // Init various tuples to make sure they have the correct dims. rank_domain_offsets = domain_dims; - rank_domain_offsets.setValsSame(-1); // indicates prepare_solution() not called. - overall_domain_sizes = domain_dims; + rank_domain_offsets.setValsSame(-1); // indicates prepare_solution() not called. TODO: add flag. max_halos = domain_dims; wf_angles = domain_dims; wf_shift_pts = domain_dims; @@ -86,231 +85,427 @@ namespace yask { // Init MPI-related vars and other vars related to my rank's place in // the global problem: rank index, offset, etc. Need to call this even // if not using MPI to properly init these vars. 
Called from - // prepare_solution(), so it doesn't normally need to be called from user code. + // prepare_solution(). void StencilContext::setupRank() { STATE_VARS(this); - + TRACE_MSG("setupRank()..."); auto me = env->my_rank; - int num_neighbors = 0; + auto nr = env->num_ranks; + + // All ranks should have the same settings for certain options. + assertEqualityOverRanks(nr, env->comm, "total number of MPI ranks"); + assertEqualityOverRanks(idx_t(opts->use_shm), env->comm, "use_shm setting"); + assertEqualityOverRanks(idx_t(opts->find_loc), env->comm, "defined rank indices"); + DOMAIN_VAR_LOOP(i, j) { + auto& dname = domain_dims.getDimName(j); + assertEqualityOverRanks(opts->_global_sizes[i], env->comm, + "global-domain size in '" + dname + "' dimension"); + assertEqualityOverRanks(opts->_num_ranks[j], env->comm, + "number of ranks in '" + dname + "' dimension"); + + // Check that either local or global size is set. + if (!opts->_global_sizes[i] && !opts->_rank_sizes[i]) + THROW_YASK_EXCEPTION("Error: both local-domain size and " + "global-domain size are zero in '" + + dname + "' dimension on rank " + + to_string(me) + "; specify one, " + "and the other will be calculated"); + } + +#ifndef USE_MPI + + // Simple settings. + opts->_num_ranks.setValsSame(0); + opts->_rank_indices.setValsSame(0); + rank_domain_offsets.setValsSame(0); + + // Init vars w/o MPI. + DOMAIN_VAR_LOOP(i, j) { + + // Need to set local size. + if (!opts->_rank_sizes[i]) + opts->_rank_sizes[i] = opts->_global_sizes[i]; + + // Need to set global size. + else if (!opts->_global_sizes[i]) + opts->_global_sizes[i] = opts->_rank_sizes[i]; + + // Check that settings are equal. + else if (opts->_global_sizes[i] != opts->_rank_sizes[i]) { + auto& dname = domain_dims.getDimName(j); + FORMAT_AND_THROW_YASK_EXCEPTION("Error: specified local-domain size of " << + opts->_rank_sizes[i] << + " does not equal specified global-domain size of " << + opts->_global_sizes[i] << " in '" << dname << + "' dimension"); + } + } + +#else + // Set number of ranks in each dim if any is unset (zero). + if (!opts->_num_ranks.product()) { + + // Make list of factors of number of ranks. + vector facts; + for (idx_t n = 1; n <= nr; n++) + if (nr % n == 0) + facts.push_back(n); + + // Keep track of "best" result, where the best is most compact. + IdxTuple best; + + // Try every combo of N-1 factors, where N is the number of dims. + // TODO: make more efficient--need algorithm to directly get + // set of N factors that are valid. + IdxTuple combos; + DOMAIN_VAR_LOOP(i, j) { + auto& dname = domain_dims.getDimName(j); + + // Number of factors. + auto sz = facts.size(); + + // Set first number of options 1 because it will be + // calculated based on the other values, i.e., we don't need + // to search over first dim. Also don't need to search any + // specified value. + if (j == 0 || opts->_num_ranks[j]) + sz = 1; + + combos.addDimBack(dname, sz); + } + TRACE_MSG("setupRank(): checking " << combos.product() << " rank layouts"); + combos.visitAllPoints + ([&](const IdxTuple& combo, size_t idx)->bool { + + // Make tuple w/factors at given indices. + auto num_ranks = combo.mapElements([&](idx_t in) { + return facts.at(in); + }); + + // Override with specified values. + DOMAIN_VAR_LOOP(i, j) { + if (opts->_num_ranks[j]) + num_ranks[j] = opts->_num_ranks[j]; + else if (j == 0) + num_ranks[j] = -1; // -1 => needs to be calculated. + } + + // Replace first factor with computed value if not set. 
+ if (num_ranks[0] == -1) { + num_ranks[0] = 1; + num_ranks[0] = nr / num_ranks.product(); + } + + // Valid? + if (num_ranks.product() == nr) { + TRACE_MSG(" valid layout " << num_ranks.makeDimValStr(" * ") << + " has max size " << num_ranks.max()); + + // Best so far? + // Layout is better if max size is smaller. + if (best.size() == 0 || + num_ranks.max() < best.max()) + best = num_ranks; + } + + return true; // keep looking. + }); + assert(best.size()); + assert(best.product()); + TRACE_MSG(" layout " << best.makeDimValStr(" * ") << " selected"); + opts->_num_ranks = best; + } // Check ranks. idx_t req_ranks = opts->_num_ranks.product(); - if (req_ranks != env->num_ranks) { + if (req_ranks != nr) FORMAT_AND_THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" + opts->_num_ranks.makeDimValStr(" * ") + "), but " << - env->num_ranks << " rank(s) are active"); - } - - // All ranks should have the same settings for using shm. - assertEqualityOverRanks(idx_t(opts->use_shm), env->comm, "use_shm"); + nr << " rank(s) are active"); // Determine my coordinates if not provided already. // TODO: do this more intelligently based on proximity. if (opts->find_loc) opts->_rank_indices = opts->_num_ranks.unlayout(me); - // A table of rank-coordinates for everyone. - idx_t coords[env->num_ranks][nddims]; - - // Init offsets and total sizes. - rank_domain_offsets.setValsSame(0); - overall_domain_sizes.setValsSame(0); - - // Init coords for this rank. - for (int i = 0; i < nddims; i++) - coords[me][i] = opts->_rank_indices[i]; - - // A table of rank-domain sizes for everyone. - idx_t rsizes[env->num_ranks][nddims]; - - // Init sizes for this rank. + // Check rank indices. DOMAIN_VAR_LOOP(i, j) { - auto rsz = opts->_rank_sizes[i]; - rsizes[me][j] = rsz; - overall_domain_sizes[j] = rsz; + auto& dname = domain_dims.getDimName(j); + if (opts->_rank_indices[j] < 0 || + opts->_rank_indices[j] >= opts->_num_ranks[j]) + THROW_YASK_EXCEPTION("Error: rank index of " + + to_string(opts->_rank_indices[j]) + + " is not within allowed range [0 ... " + + to_string(opts->_num_ranks[j] - 1) + + "] in '" + dname + "' dimension on rank " + + to_string(me)); } + + // Init starting indices for this rank. + rank_domain_offsets.setValsSame(0); -#ifdef USE_MPI - // Exchange coord and size info between all ranks. - for (int rn = 0; rn < env->num_ranks; rn++) { - MPI_Bcast(&coords[rn][0], nddims, MPI_INTEGER8, - rn, env->comm); - MPI_Bcast(&rsizes[rn][0], nddims, MPI_INTEGER8, - rn, env->comm); - } - // Now, the tables are filled in for all ranks. + // Tables to share data across ranks. + idx_t coords[nr][nddims]; // rank indices. + idx_t rsizes[nr][nddims]; // rank sizes. - // Loop over all ranks, including myself. - for (int rn = 0; rn < env->num_ranks; rn++) { + // Two passes over ranks: + // 0: sum all specified local sizes. + // 1: set final sums and offsets. + for (int pass : { 0, 1 }) { - // Coord offset of rn from me: prev => negative, self => 0, next => positive. - IdxTuple rcoords(domain_dims); - IdxTuple rdeltas(domain_dims); - for (int di = 0; di < nddims; di++) { - rcoords[di] = coords[rn][di]; - rdeltas[di] = coords[rn][di] - opts->_rank_indices[di]; - } + // Init rank-size sums. + IdxTuple rank_domain_sums(domain_dims); + rank_domain_sums.setValsSame(0); - // Manhattan distance from rn (sum of abs deltas in all dims). - // Max distance in any dim. 
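The rank-layout search added to `setupRank()` above tries factorizations of the total rank count and keeps the most compact one. A standalone simplification of the same idea, ignoring user-specified per-dimension overrides (a sketch, not the actual YASK code):

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>
using idx_t = int64_t;

// Split 'nr' ranks across 'ndims' dims so the largest per-dim count is minimized.
std::vector<idx_t> pick_rank_layout(idx_t nr, int ndims) {
    std::vector<idx_t> best, cur(ndims, 1);
    std::function<void(int, idx_t)> search = [&](int dim, idx_t left) {
        if (dim == ndims - 1) {
            cur[dim] = left;             // last dim takes whatever remains.
            auto mx = *std::max_element(cur.begin(), cur.end());
            if (best.empty() || mx < *std::max_element(best.begin(), best.end()))
                best = cur;
            return;
        }
        for (idx_t f = 1; f <= left; f++)
            if (left % f == 0) {         // only exact factorizations are valid layouts.
                cur[dim] = f;
                search(dim + 1, left / f);
            }
    };
    search(0, nr);
    return best;                         // e.g., nr=8, ndims=3 => {2, 2, 2}.
}
```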
- int mandist = 0; - int maxdist = 0; - for (int di = 0; di < nddims; di++) { - mandist += abs(rdeltas[di]); - maxdist = max(maxdist, abs(int(rdeltas[di]))); + // Init tables for this rank. + DOMAIN_VAR_LOOP(i, j) { + coords[me][j] = opts->_rank_indices[j]; + rsizes[me][j] = opts->_rank_sizes[i]; } - // Myself. - if (rn == me) { - if (mandist != 0) - FORMAT_AND_THROW_YASK_EXCEPTION - ("Internal error: distance to own rank == " << mandist); + // Exchange coord and size info between all ranks. + for (int rn = 0; rn < nr; rn++) { + MPI_Bcast(&coords[rn][0], nddims, MPI_INTEGER8, + rn, env->comm); + MPI_Bcast(&rsizes[rn][0], nddims, MPI_INTEGER8, + rn, env->comm); } + // Now, the tables are filled in for all ranks. + // Some rank sizes may be zero on the 1st pass, + // but they should all be non-zero on 2nd pass. + + // Loop over all ranks, including myself. + int num_neighbors = 0; + for (int rn = 0; rn < nr; rn++) { + + // Coord offset of rn from me: prev => negative, self => 0, next => positive. + IdxTuple rcoords(domain_dims); + IdxTuple rdeltas(domain_dims); + DOMAIN_VAR_LOOP(i, di) { + rcoords[di] = coords[rn][di]; + rdeltas[di] = coords[rn][di] - coords[me][di]; + } - // Someone else. - else { - if (mandist == 0) - FORMAT_AND_THROW_YASK_EXCEPTION - ("Error: ranks " << me << - " and " << rn << " at same coordinates"); - } + // Manhattan distance from rn (sum of abs deltas in all dims). + // Max distance in any dim. + int mandist = 0; + int maxdist = 0; + DOMAIN_VAR_LOOP(i, di) { + mandist += abs(rdeltas[di]); + maxdist = max(maxdist, abs(int(rdeltas[di]))); + } - // Loop through domain dims. - for (int di = 0; di < nddims; di++) { - auto& dname = opts->_rank_indices.getDimName(di); + // Myself. + if (rn == me) { + if (mandist != 0) + FORMAT_AND_THROW_YASK_EXCEPTION + ("Internal error: distance to own rank == " << mandist); + } - // Is rank 'rn' in-line with my rank in 'dname' dim? - // True when deltas in other dims are zero. - bool is_inline = true; - for (int dj = 0; dj < nddims; dj++) { - if (di != dj && rdeltas[dj] != 0) { - is_inline = false; - break; - } + // Someone else. + else { + if (mandist == 0) + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: ranks " << me << + " and " << rn << " at same coordinates"); } - // Process ranks that are in-line in 'dname', including self. - if (is_inline) { - - // Accumulate total problem size in each dim for ranks that - // intersect with this rank, not including myself. - if (rn != me) - overall_domain_sizes[dname] += rsizes[rn][di]; - - // Adjust my offset in the global problem by adding all domain - // sizes from prev ranks only. - if (rdeltas[di] < 0) - rank_domain_offsets[dname] += rsizes[rn][di]; - - // Make sure all the other dims are the same size. - // This ensures that all the ranks' domains line up - // properly along their edges and at their corners. - for (int dj = 0; dj < nddims; dj++) { - if (di != dj) { - auto mysz = rsizes[me][dj]; - auto rnsz = rsizes[rn][dj]; - if (mysz != rnsz) { - auto& dnamej = opts->_rank_indices.getDimName(dj); - FORMAT_AND_THROW_YASK_EXCEPTION - ("Error: rank " << rn << " and " << me << - " are both at rank-index " << coords[me][di] << - " in the '" << dname << - "' dimension , but their rank-domain sizes are " << - rnsz << " and " << mysz << - " (resp.) in the '" << dj << - "' dimension, making them unaligned"); + // Loop through domain dims. + DOMAIN_VAR_LOOP(i, di) { + auto& dname = domain_dims.getDimName(di); + + // Is rank 'rn' in-line with my rank in 'dname' dim? 
+ // True when deltas in all other dims are zero. + bool is_inline = true; + DOMAIN_VAR_LOOP(j, dj) { + if (di != dj && rdeltas[dj] != 0) { + is_inline = false; + break; + } + } + + // Process this rank if it is in-line with me in 'dname', including myself. + if (is_inline) { + + // Sum rank sizes in this dim. + rank_domain_sums[di] += rsizes[rn][di]; + + if (pass == 1) { + + // Make sure all the other dims are the same size. + // This ensures that all the ranks' domains line up + // properly along their edges and at their corners. + DOMAIN_VAR_LOOP(j, dj) { + if (di != dj) { + auto& dnamej = domain_dims.getDimName(dj); + auto mysz = rsizes[me][dj]; + auto rnsz = rsizes[rn][dj]; + if (mysz != rnsz) { + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: rank " << rn << " and " << me << + " are both at rank-index " << coords[me][di] << + " in the '" << dname << + "' dimension, but their local-domain sizes are " << + rnsz << " and " << mysz << + " (resp.) in the '" << dnamej << + "' dimension, making them unaligned"); + } + } + } + + // Adjust my offset in the global problem by adding all domain + // sizes from prev ranks only. + if (rdeltas[di] < 0) + rank_domain_offsets[dname] += rsizes[rn][di]; + + } // 2nd pass. + } // is inline w/me. + } // dims. + + // Rank rn is myself or my immediate neighbor if its distance <= 1 in + // every dim. Assume we do not need to exchange halos except + // with immediate neighbor. We enforce this assumption below by + // making sure that the rank domain size is at least as big as the + // largest halo. + if (pass == 1 && maxdist <= 1) { + + // At this point, rdeltas contains only -1..+1 for each domain dim. + // Add one to -1..+1 to get 0..2 range for my_neighbors offsets. + IdxTuple roffsets = rdeltas.addElements(1); + assert(rdeltas.min() >= -1); + assert(rdeltas.max() <= 1); + assert(roffsets.min() >= 0); + assert(roffsets.max() <= 2); + + // Convert the offsets into a 1D index. + auto rn_ofs = mpiInfo->getNeighborIndex(roffsets); + TRACE_MSG("neighborhood size = " << mpiInfo->neighborhood_sizes.makeDimValStr() << + " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() << + " => " << rn_ofs); + assert(idx_t(rn_ofs) < mpiInfo->neighborhood_size); + + // Save rank of this neighbor into the MPI info object. + mpiInfo->my_neighbors.at(rn_ofs) = rn; + if (rn == me) { + assert(mpiInfo->my_neighbor_index == rn_ofs); + mpiInfo->shm_ranks.at(rn_ofs) = env->my_shm_rank; + } + else { + num_neighbors++; + os << "Neighbor #" << num_neighbors << " is MPI rank " << rn << + " at absolute rank indices " << rcoords.makeDimValStr() << + " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " << + me << ")"; + + // Determine whether neighbor is in my shm group. + // If so, record rank number in shmcomm. + if (opts->use_shm && env->shm_comm != MPI_COMM_NULL) { + int g_rank = rn; + int s_rank = MPI_PROC_NULL; + MPI_Group_translate_ranks(env->group, 1, &g_rank, + env->shm_group, &s_rank); + if (s_rank != MPI_UNDEFINED) { + mpiInfo->shm_ranks.at(rn_ofs) = s_rank; + os << " and is MPI shared-memory rank " << s_rank; + } else { + os << " and will not use shared-memory"; } } + os << ".\n"; } - } // is inline w/me. - } // dims. - - // Rank rn is myself or my immediate neighbor if its distance <= 1 in - // every dim. Assume we do not need to exchange halos except - // with immediate neighbor. We enforce this assumption below by - // making sure that the rank domain size is at least as big as the - // largest halo. 
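The per-dimension neighbor offsets (0..2 after adding 1 to each delta) are flattened into a single index by `mpiInfo->getNeighborIndex()`; the idea is a mixed-radix encoding over the 3-wide neighborhood in each dim. A hypothetical 3-D illustration (the actual layout order is defined by `MPIInfo`, so this shows only the concept):

```cpp
// Each offset is in {0, 1, 2}; a 3-D neighborhood therefore has 3*3*3 = 27 slots.
int neighbor_index(int ox, int oy, int oz) {
    return (ox * 3 + oy) * 3 + oz;   // self at (1,1,1) maps to slot 13, the middle one.
}
```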
- if (maxdist <= 1) { - - // At this point, rdeltas contains only -1..+1 for each domain dim. - // Add one to -1..+1 to get 0..2 range for my_neighbors offsets. - IdxTuple roffsets = rdeltas.addElements(1); - assert(rdeltas.min() >= -1); - assert(rdeltas.max() <= 1); - assert(roffsets.min() >= 0); - assert(roffsets.max() <= 2); - - // Convert the offsets into a 1D index. - auto rn_ofs = mpiInfo->getNeighborIndex(roffsets); - TRACE_MSG("neighborhood size = " << mpiInfo->neighborhood_sizes.makeDimValStr() << - " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() << - " => " << rn_ofs); - assert(idx_t(rn_ofs) < mpiInfo->neighborhood_size); - - // Save rank of this neighbor into the MPI info object. - mpiInfo->my_neighbors.at(rn_ofs) = rn; - if (rn == me) { - assert(mpiInfo->my_neighbor_index == rn_ofs); - mpiInfo->shm_ranks.at(rn_ofs) = env->my_shm_rank; - } - else { - num_neighbors++; - os << "Neighbor #" << num_neighbors << " is MPI rank " << rn << - " at absolute rank indices " << rcoords.makeDimValStr() << - " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " << - me << ")"; - - // Determine whether neighbor is in my shm group. - // If so, record rank number in shmcomm. - if (opts->use_shm && env->shm_comm != MPI_COMM_NULL) { - int g_rank = rn; - int s_rank = MPI_PROC_NULL; - MPI_Group_translate_ranks(env->group, 1, &g_rank, - env->shm_group, &s_rank); - if (s_rank != MPI_UNDEFINED) { - mpiInfo->shm_ranks.at(rn_ofs) = s_rank; - os << " and is MPI shared-memory rank " << s_rank; - } else { - os << " and will not use shared-memory"; + + // Save manhattan dist. + mpiInfo->man_dists.at(rn_ofs) = mandist; + + // Loop through domain dims. + bool vlen_mults = true; + DOMAIN_VAR_LOOP(i, j) { + auto& dname = domain_dims.getDimName(j); + auto nranks = opts->_num_ranks[j]; + bool is_last = (opts->_rank_indices[j] == nranks - 1); + + // Does rn have all VLEN-multiple sizes? + // TODO: allow last rank in each dim to be non-conformant. + auto rnsz = rsizes[rn][j]; + auto vlen = fold_pts[j]; + if (rnsz % vlen != 0) { + TRACE_MSG("cannot use vector halo exchange with rank " << rn << + " because its size in '" << dname << "' is " << rnsz); + vlen_mults = false; } } - os << ".\n"; - } - // Save manhattan dist. - mpiInfo->man_dists.at(rn_ofs) = mandist; + // Save vec-mult flag. + mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; - // Loop through domain dims. - bool vlen_mults = true; - DOMAIN_VAR_LOOP(i, j) { + } // self or immediate neighbor in any direction. + } // ranks. - // Does rn have all VLEN-multiple sizes? - auto rnsz = rsizes[rn][j]; - auto vlen = fold_pts[j]; - if (rnsz % vlen != 0) { - auto& dname = opts->_rank_indices.getDimName(j); - TRACE_MSG("cannot use vector halo exchange with rank " << rn << - " because its size in '" << dname << "' is " << rnsz); - vlen_mults = false; + // At end of 1st pass, known ranks sizes have + // been summed in each dim. Determine global size + // or other rank sizes for each dim. + if (pass == 0) { + DOMAIN_VAR_LOOP(i, j) { + auto& dname = domain_dims.getDimName(j); + auto nranks = opts->_num_ranks[j]; + auto gsz = opts->_global_sizes[i]; + bool is_last = (opts->_rank_indices[j] == nranks - 1); + + // Need to determine my rank size. 
+ if (!opts->_rank_sizes[i]) { + if (rank_domain_sums[j] != 0) + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: local-domain size is not specified in the '" << + dname << "' dimension on rank " << me << + ", but it is specified on another rank; " + "it must be specified or unspecified consistently across all ranks"); + + // Divide sum by num of ranks in this dim. + auto rsz = CEIL_DIV(gsz, nranks); + + // Round up to whole vector-clusters. + rsz = ROUND_UP(rsz, dims->_cluster_pts[j]); + + // Remainder for last rank. + auto rem = gsz - (rsz * (nranks - 1)); + if (rem <= 0) + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: global-domain size of " << gsz << + " is not large enough to split across " << nranks << + " ranks in the '" << dname << "' dimension"); + if (is_last) + rsz = rem; + + // Set rank size depending on whether it is last one. + opts->_rank_sizes[i] = rsz; + TRACE_MSG("local-domain-size[" << dname << "] = " << rem); } - } - // Save vec-mult flag. - mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; + // Need to determine global size. + // Set it to sum of rank sizes. + else if (!opts->_global_sizes[i]) + opts->_global_sizes[i] = rank_domain_sums[j]; + } + } - } // self or immediate neighbor in any direction. + // After 2nd pass, check for consistency. + else { + DOMAIN_VAR_LOOP(i, j) { + auto& dname = domain_dims.getDimName(j); + if (opts->_global_sizes[i] != rank_domain_sums[j]) { + FORMAT_AND_THROW_YASK_EXCEPTION("Error: sum of local-domain sizes across " << + nr << " ranks is " << + rank_domain_sums[j] << + ", which does not equal global-domain size of " << + opts->_global_sizes[i] << " in '" << dname << + "' dimension"); + } + } + } - } // ranks. + } // passes. #endif - // Set offsets in grids and find WF extensions - // based on the grids' halos. - update_grid_info(); - - // Determine bounding-boxes for all bundles. - // This must be done after finding WF extensions. - find_bounding_boxes(); - } // setupRank(). // Set non-scratch grid sizes and offsets based on settings. @@ -318,6 +513,7 @@ namespace yask { // This should be called anytime a setting or rank offset is changed. void StencilContext::update_grid_info() { STATE_VARS(this); + TRACE_MSG("update_grid_info()..."); // If we haven't finished constructing the context, it's too early // to do this. @@ -392,7 +588,6 @@ namespace yask { auto nranks = opts->_num_ranks[dname]; // Req'd shift in this dim based on max halos. - // TODO: use different angle for L & R side of each pack. idx_t angle = ROUND_UP(max_halos[dname], dims->_fold_pts[dname]); // Determine the spatial skewing angles for WF tiling. We @@ -415,9 +610,10 @@ namespace yask { // when there are multiple ranks? auto min_size = max_halos[dname] + shifts; if (opts->_num_ranks[dname] > 1 && rksize < min_size) { - FORMAT_AND_THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << - dname << "' dim is less than minimum size of " << min_size << - ", which is based on stencil halos and temporal wave-front sizes"); + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: local-domain size of " << rksize << " in '" << + dname << "' dim is less than minimum size of " << min_size << + ", which is based on stencil halos and temporal wave-front sizes"); } // If there is another rank to the left, set wave-front @@ -465,6 +661,7 @@ namespace yask { // all packs always done. void StencilContext::update_tb_info() { STATE_VARS(this); + TRACE_MSG("update_tb_info()..."); // Get requested size. 
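As a worked example of the pass-0 size derivation above (`CEIL_DIV` and `ROUND_UP` are the kernel's usual rounding macros; the numbers are illustrative):

```cpp
// Global size 1000 split across 3 ranks with a 4-point vector-cluster in this dim:
//   rsz = CEIL_DIV(1000, 3)     -> 334
//   rsz = ROUND_UP(334, 4)      -> 336   (whole vector-clusters)
//   rem = 1000 - 336 * (3 - 1)  -> 328   (what is left for the last rank)
// Ranks 0 and 1 get 336 points each; the last rank gets 328; 336 + 336 + 328 == 1000.
```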
tb_steps = opts->_block_sizes[step_dim]; diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp index f2c53d72..501256d1 100644 --- a/src/kernel/lib/soln_apis.cpp +++ b/src/kernel/lib/soln_apis.cpp @@ -44,7 +44,7 @@ namespace yask { return expr; \ } GET_SOLN_API(get_num_ranks, opts->_num_ranks[dim], false, true, false, false) - GET_SOLN_API(get_overall_domain_size, overall_domain_sizes[dim], false, true, false, true) + GET_SOLN_API(get_overall_domain_size, opts->_global_sizes[dim], false, true, false, true) GET_SOLN_API(get_rank_domain_size, opts->_rank_sizes[dim], false, true, false, false) GET_SOLN_API(get_region_size, opts->_region_sizes[dim], true, true, false, false) GET_SOLN_API(get_block_size, opts->_block_sizes[dim], true, true, false, false) @@ -63,9 +63,13 @@ namespace yask { update_grid_info(); \ if (reset_prep) rank_bb.bb_valid = ext_bb.bb_valid = false; \ } - SET_SOLN_API(set_rank_index, opts->_rank_indices[dim] = n, false, true, false, true) + SET_SOLN_API(set_rank_index, opts->_rank_indices[dim] = n; + opts->find_loc = false, false, true, false, true) SET_SOLN_API(set_num_ranks, opts->_num_ranks[dim] = n, false, true, false, true) - SET_SOLN_API(set_rank_domain_size, opts->_rank_sizes[dim] = n, false, true, false, true) + SET_SOLN_API(set_overall_domain_size, opts->_global_sizes[dim] = n; + if (n) opts->_rank_sizes[dim] = 0, false, true, false, true) + SET_SOLN_API(set_rank_domain_size, opts->_rank_sizes[dim] = n; + if (n) opts->_global_sizes[dim] = 0, false, true, false, true) SET_SOLN_API(set_region_size, opts->_region_sizes[dim] = n, true, true, false, true) SET_SOLN_API(set_block_size, opts->_block_sizes[dim] = n, true, true, false, true) SET_SOLN_API(set_min_pad_size, opts->_min_pad_sizes[dim] = n, false, true, false, false) @@ -88,6 +92,9 @@ namespace yask { #ifdef MODEL_CACHE os << "*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results.\n"; #endif +#ifdef TRACE + os << "*** WARNING: YASK compiled with TRACE; ignore performance results.\n"; +#endif #ifdef TRACE_MEM os << "*** WARNING: YASK compiled with TRACE_MEM; ignore performance results.\n"; #endif @@ -98,18 +105,6 @@ namespace yask { // reset time keepers. clear_timers(); - // Adjust all settings before setting MPI buffers or sizing grids. - // Prints adjusted settings. - // TODO: print settings again after auto-tuning. - opts->adjustSettings(os); - - // Copy current settings to packs. - // Needed here because settings may have been changed via APIs - // since last call to prepare_solution(). - // This will wipe out any previous auto-tuning. - for (auto& sp : stPacks) - sp->getLocalSettings() = *opts; - // Init auto-tuner to run silently during normal operation. reset_auto_tuner(true, false); @@ -144,10 +139,30 @@ namespace yask { os << "Num grids: " << gridPtrs.size() << endl; os << "Num grids to be updated: " << outputGridPtrs.size() << endl; - // Set up data based on MPI rank, including grid positions. - // Update all the grid sizes. + // Set up data based on MPI rank, including local or global sizes, + // grid positions. setupRank(); + // Adjust all settings before setting MPI buffers or sizing grids. + // Prints adjusted settings. + // TODO: print settings again after auto-tuning. + opts->adjustSettings(os); + + // Set offsets in grids and find WF extensions + // based on the grids' halos. + update_grid_info(); + + // Determine bounding-boxes for all bundles. + // This must be done after finding WF extensions. + find_bounding_boxes(); + + // Copy current settings to packs. 
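For clarity, roughly what the updated `SET_SOLN_API` entries above do for the two domain-size setters (a sketch of the effect, not the literal macro expansion):

```cpp
// set_overall_domain_size(dim, n): the global size wins until prepare_solution()
// recomputes the per-rank sizes.
opts->_global_sizes[dim] = n;
if (n) opts->_rank_sizes[dim] = 0;

// set_rank_domain_size(dim, n): the local size wins until prepare_solution()
// recomputes the global size.
opts->_rank_sizes[dim] = n;
if (n) opts->_global_sizes[dim] = 0;
```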
Needed here because settings may + // have been changed via APIs or from call to setupRank() since last + // call to prepare_solution(). This will wipe out any previous + // auto-tuning. + for (auto& sp : stPacks) + sp->getLocalSettings() = *opts; + // Alloc grids, scratch grids, MPI bufs. // This is the order in which preferred NUMA nodes (e.g., HBW mem) // will be used. @@ -179,7 +194,7 @@ namespace yask { " wave-front-shift-amounts: " << wf_shift_pts.makeDimValStr() << endl << " left-wave-front-exts: " << left_wf_exts.makeDimValStr() << endl << " right-wave-front-exts: " << right_wf_exts.makeDimValStr() << endl << - " ext-rank-domain: " << ext_bb.bb_begin.makeDimValStr() << + " ext-local-domain: " << ext_bb.bb_begin.makeDimValStr() << " ... " << ext_bb.bb_end.subElements(1).makeDimValStr() << endl << " num-temporal-block-steps: " << tb_steps << endl << " temporal-block-angles: " << tb_angles.makeDimValStr() << endl << @@ -211,12 +226,12 @@ namespace yask { os << "\nWork-unit sizes in points (from smallest to largest):\n" " vector-size: " << dims->_fold_pts.makeDimValStr(" * ") << endl << " cluster-size: " << dims->_cluster_pts.makeDimValStr(" * ") << endl << - " sub-block-size: " << opts->_sub_block_sizes.makeDimValStr(" * ") << endl << + " sub-block-size: " << opts->_sub_block_sizes.removeDim(step_posn).makeDimValStr(" * ") << endl << " mini-block-size: " << opts->_mini_block_sizes.makeDimValStr(" * ") << endl << " block-size: " << opts->_block_sizes.makeDimValStr(" * ") << endl << " region-size: " << opts->_region_sizes.makeDimValStr(" * ") << endl << - " rank-domain-size: " << opts->_rank_sizes.makeDimValStr(" * ") << endl << - " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl; + " local-domain-size: " << opts->_rank_sizes.removeDim(step_posn).makeDimValStr(" * ") << endl << + " global-domain-size: " << opts->_global_sizes.removeDim(step_posn).makeDimValStr(" * ") << endl; #ifdef SHOW_GROUPS os << " sub-block-group-size: " << opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << @@ -226,13 +241,13 @@ namespace yask { " yask-version: " << yask_get_version_string() << endl << " stencil-name: " << get_name() << endl << " element-size: " << makeByteStr(get_element_bytes()) << endl << - " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << + " local-domain: " << rank_bb.bb_begin.makeDimValStr() << " ... 
" << rank_bb.bb_end.subElements(1).makeDimValStr() << endl; #ifdef USE_MPI os << " num-ranks: " << opts->_num_ranks.makeDimValStr(" * ") << endl << " rank-indices: " << opts->_rank_indices.makeDimValStr() << endl << - " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl; + " local-domain-offsets: " << rank_domain_offsets.makeDimValStr() << endl; if (opts->overlap_comms) os << " mpi-interior: " << mpi_interior.bb_begin.makeDimValStr() << diff --git a/src/kernel/lib/utils.cpp b/src/kernel/lib/utils.cpp index 4db591d6..83415eb3 100644 --- a/src/kernel/lib/utils.cpp +++ b/src/kernel/lib/utils.cpp @@ -463,7 +463,7 @@ namespace yask { #endif if (min_val != rank_val || max_val != rank_val) { - FORMAT_AND_THROW_YASK_EXCEPTION("error: " << descr << " values range from " << min_val << " to " << + FORMAT_AND_THROW_YASK_EXCEPTION("error: " << descr << " ranges from " << min_val << " to " << max_val << " across the ranks; they should all be identical"); } } diff --git a/src/kernel/lib/utils.hpp b/src/kernel/lib/utils.hpp index 66147400..572ca867 100644 --- a/src/kernel/lib/utils.hpp +++ b/src/kernel/lib/utils.hpp @@ -62,16 +62,18 @@ namespace yask { // Fatal error. // TODO: enable exception throwing that works w/SWIG. inline void exit_yask(int code) { + #ifdef USE_MPI int flag; MPI_Initialized(&flag); - if (flag) - MPI_Abort(MPI_COMM_WORLD, code); - else - exit(code); -#else - exit(code); + if (flag) { + int num_ranks = 1; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + if (num_ranks > 1) + MPI_Abort(MPI_COMM_WORLD, code); + } #endif + exit(code); } // Return num with SI multiplier and "iB" suffix, diff --git a/src/kernel/lib/yask_stencil.hpp b/src/kernel/lib/yask_stencil.hpp index 57faa0ec..efc36f59 100644 --- a/src/kernel/lib/yask_stencil.hpp +++ b/src/kernel/lib/yask_stencil.hpp @@ -63,7 +63,7 @@ IN THE SOFTWARE. // First/last index macros. // These are relative to global problem, not rank. #define FIRST_INDEX(dim) (0) -#define LAST_INDEX(dim) (_context->overall_domain_sizes[DOMAIN_DIM_IDX_ ## dim] - 1) +#define LAST_INDEX(dim) (_context->get_settings().get()->_global_sizes[DOMAIN_DIM_IDX_ ## dim] - 1) // Macros for 1D<->nD transforms. #include "yask_layout_macros.hpp" diff --git a/src/kernel/tests/yask_kernel_api_exception_test.cpp b/src/kernel/tests/yask_kernel_api_exception_test.cpp index 64f93463..bf1d8df9 100644 --- a/src/kernel/tests/yask_kernel_api_exception_test.cpp +++ b/src/kernel/tests/yask_kernel_api_exception_test.cpp @@ -52,16 +52,10 @@ int main() { for (auto dim_name : soln_dims) { // Set domain size in each dim. - soln->set_rank_domain_size(dim_name, 128); + soln->set_overall_domain_size(dim_name, 128); - // Ensure some minimal padding on all grids. - soln->set_min_pad_size(dim_name, 1); - - // Set block size to 64 in z dim and 32 in other dims. - if (dim_name == "z") - soln->set_block_size(dim_name, 64); - else - soln->set_block_size(dim_name, 32); + // Set block size. + soln->set_block_size(dim_name, 32); } // Make a test fixed-size grid. @@ -70,10 +64,6 @@ int main() { fgrid_sizes.push_back(5); auto fgrid = soln->new_fixed_size_grid("fgrid", soln_dims, fgrid_sizes); - // Simple rank configuration in 1st dim only. 
-        auto ddim1 = soln_dims[0];
-        soln->set_num_ranks(ddim1, env->get_num_ranks());
-
         // Exception test
         cout << "Exception Test: Call 'run_solution' without calling prepare_solution().\n";
         try {
diff --git a/src/kernel/tests/yask_kernel_api_exception_test.py b/src/kernel/tests/yask_kernel_api_exception_test.py
index f2ab9c1b..6f100fca 100755
--- a/src/kernel/tests/yask_kernel_api_exception_test.py
+++ b/src/kernel/tests/yask_kernel_api_exception_test.py
@@ -187,17 +187,10 @@ def init_grid(grid, timestep) :
     for dim_name in soln_dims :
         # Set domain size in each dim.
-        soln.set_rank_domain_size(dim_name, 128)
+        soln.set_overall_domain_size(dim_name, 128)
-        # Ensure some minimal padding on all grids.
-        soln.set_min_pad_size(dim_name, 1)
-
-        # Set block size to 64 in z dim and 32 in other dims.
-        # (Not necessarily useful, just as an example.)
-        if dim_name == "z" :
-            soln.set_block_size(dim_name, 64)
-        else :
-            soln.set_block_size(dim_name, 32)
+        # Set block size.
+        soln.set_block_size(dim_name, 32)
     # Make a test fixed-size grid.
     fgrid_sizes = ()
@@ -205,12 +198,6 @@ def init_grid(grid, timestep) :
         fgrid_sizes += (5,)
     fgrid = soln.new_fixed_size_grid("fgrid", soln_dims, fgrid_sizes)
-    # Simple rank configuration in 1st dim only.
-    # In production runs, the ranks would be distributed along
-    # all domain dimensions.
-    ddim1 = soln_dims[0]  # name of 1st dim.
-    soln.set_num_ranks(ddim1, env.get_num_ranks())  # num ranks in this dim.
-
     # Exception test
     print("Exception Test: Call 'run_solution' without calling prepare_solution().")
     try:
diff --git a/src/kernel/tests/yask_kernel_api_test.cpp b/src/kernel/tests/yask_kernel_api_test.cpp
index a7822068..626e0f22 100644
--- a/src/kernel/tests/yask_kernel_api_test.cpp
+++ b/src/kernel/tests/yask_kernel_api_test.cpp
@@ -66,17 +66,18 @@ int main() {
         cout << "Following information from rank " << rank_num << ".\n";
         ostream& os = *osp;
-        // Init global settings.
+        // Init solution settings.
         auto soln_dims = soln->get_domain_dim_names();
         for (auto dim_name : soln_dims) {
             // Set domain size in each dim.
-            soln->set_rank_domain_size(dim_name, 128);
+            soln->set_overall_domain_size(dim_name, 128);
             // Ensure some minimal padding on all grids.
             soln->set_min_pad_size(dim_name, 1);
             // Set block size to 64 in z dim and 32 in other dims.
+            // NB: just illustrative.
             if (dim_name == "z")
                 soln->set_block_size(dim_name, 64);
             else
@@ -91,10 +92,6 @@ int main() {
             fgrid_sizes.push_back(5);
         auto fgrid = soln->new_fixed_size_grid("fgrid", fgrid_dims, fgrid_sizes);
-        // Simple rank configuration in 1st dim only.
-        auto ddim1 = soln_dims[0];
-        soln->set_num_ranks(ddim1, env->get_num_ranks());
-
         // Allocate memory for any grids that do not have storage set.
         // Set other data structures needed for stencil application.
         soln->prepare_solution();
diff --git a/src/kernel/tests/yask_kernel_api_test.py b/src/kernel/tests/yask_kernel_api_test.py
index 2c74fdf9..edf573f6 100755
--- a/src/kernel/tests/yask_kernel_api_test.py
+++ b/src/kernel/tests/yask_kernel_api_test.py
@@ -182,7 +182,7 @@ def init_grid(grid, timestep) :
     for dim_name in soln_dims :
         # Set domain size in each dim.
-        soln.set_rank_domain_size(dim_name, 128)
+        soln.set_overall_domain_size(dim_name, 128)
         # Ensure some minimal padding on all grids.
         soln.set_min_pad_size(dim_name, 1)
@@ -202,12 +202,6 @@ def init_grid(grid, timestep) :
     fgrid.set_numa_preferred(yk.cvar.yask_numa_local)
     fgrid.alloc_storage()
-    # Simple rank configuration in 1st dim only.
-    # In production runs, the ranks would be distributed along
-    # all domain dimensions.
-    ddim1 = soln_dims[0]  # name of 1st dim.
-    soln.set_num_ranks(ddim1, env.get_num_ranks())  # num ranks in this dim.
-
     # Allocate memory for any grids that do not have storage set.
     # Set other data structures needed for stencil application.
     soln.prepare_solution()
diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh
index 3a732fdc..9f7b97c8 100755
--- a/src/kernel/yask.sh
+++ b/src/kernel/yask.sh
@@ -25,7 +25,7 @@
 # Purpose: run stencil kernel in specified environment.
-# Create invocation string.
+# Create invocation string w/proper quoting.
 invo="Invocation: $0"
 whitespace="[[:space:]]"
 for i in "$@"
@@ -70,12 +70,11 @@ if command -v numactl >/dev/null; then
     fi
 fi
-# Extra options for exe.
-opts=""
-
 # Other defaults.
 pre_cmd=true
-post_cmd=true
+post_cmd=""
+helping=0
+opts=""
 # Display stencils in this dir and exit.
 bindir=`dirname $0`
@@ -123,7 +122,7 @@ while true; do
        echo " -sh_prefix <command>"
        echo "   Run sub-shell under <command>, e.g., a custom ssh command."
        echo " -exe_prefix <command>"
-       echo "   Run YASK executable under <command>, e.g., 'numactl'."
+       echo "   Run YASK executable under <command>, e.g., 'numactl -N 0'."
        echo " -pre_cmd <command>"
        echo "   One or more commands to run before YASK executable."
        echo " -post_cmd <command>"
@@ -131,11 +130,10 @@ while true; do
        echo " -mpi_cmd <command>"
        echo "   Run <command> before the executable (and before the -exe_prefix argument)."
        echo " -ranks <N>"
-       echo "   Simplified MPI run (x-dimension partition only)."
+       echo "   Simplified MPI run (<N> ranks on current host)."
        echo "   Shortcut for the following options if <N> > 1:"
-       echo "    -mpi_cmd mpirun -np <N> -nrx <N>"
-       echo "   If a different MPI command or config is needed, use -mpi_cmd "
-       echo "   explicitly and -nr* options as needed instead."
+       echo "    -mpi_cmd 'mpirun -np <N>'"
+       echo "   If a different MPI command is needed, use -mpi_cmd explicitly."
        if [[ -n "$nranks" ]]; then
            echo "   The default for this host is '$nranks'."
        fi
@@ -148,7 +146,16 @@ while true; do
        echo " <var>=<value>"
        echo "   Set environment variable <var> to <value>."
        echo "   Repeat as necessary to set multiple vars."
-       exit 1
+       exit 0
+
+    elif [[ "$1" == "-help" ]]; then
+       helping=1
+       nranks=1
+       logfile='/dev/null'
+
+       # Pass option to executable.
+       opts+=" $1"
+       shift
    elif [[ "$1" == "-show_arch" ]]; then
        echo $arch
@@ -218,12 +225,12 @@ while true; do
        shift
        # Pass all remaining options to executable and stop parsing.
-       opts="$opts $@"
+       opts+=" $@"
        break
    else
        # Pass this unknown option to executable.
-       opts="$opts $1"
+       opts+=" $1"
        shift
    fi
@@ -240,7 +247,6 @@ fi
 # Simplified MPI in x-dim only.
 if [[ -n "$nranks" && $nranks > 1 ]]; then
     true ${mpi_cmd="mpirun -np $nranks"}
-    opts="-nrx $nranks $opts"  # Put this opt at beginning to allow override.
 fi
 # Bail on errors past this point, but only errors
@@ -315,16 +321,27 @@ config_cmds="uname -a; sleep 1; uptime; sed '/^$/q' /proc/cpuinfo; lscpu; $dump
 # Command sequence to be run in a shell.
 # Captures
-cmds="cd $dir; $config_cmds; ldd $exe; date; $pre_cmd; env $envs $mpi_cmd $exe_prefix $exe $opts; $post_cmd; date"
+cmds="cd $dir; $config_cmds; ldd $exe; date; $pre_cmd; env $envs $mpi_cmd $exe_prefix $exe $opts"
+if [[ -n "$post_cmd" ]]; then
+    cmds+="; $post_cmd"
+fi
 echo "===================" | tee -a $logfile
+# Finally, invoke the binary.
 if [[ -z "$sh_prefix" ]]; then
     sh -c -x "$cmds" 2>&1 | tee -a $logfile
 else
     echo "Running shell under '$sh_prefix'..."
     $sh_prefix "sh -c -x '$cmds'" 2>&1 | tee -a $logfile
 fi
+date
+echo "===================" | tee -a $logfile
+
+# Exit if just getting help.
+if [[ $helping == 1 ]]; then
+    exit 0
+fi
 echo $invo
 echo "Log saved in '$logfile'."
@@ -335,14 +352,14 @@ exe_str="'$mpi_cmd $exe_prefix $exe $opts'"
 # Return a non-zero exit condition if test failed.
 if [[ `grep -c 'TEST FAILED' $logfile` > 0 ]]; then
     echo $exe_str did not pass internal validation test. | tee -a $logfile
-    exit 1;
+    exit 1
 fi
 # Return a non-zero exit condition if executable didn't exit cleanly.
 if [[ `grep -c 'YASK DONE' $logfile` == 0 ]]; then
     echo $exe_str did not exit cleanly. | tee -a $logfile
-    exit 1;
+    exit 1
 fi
 echo $exe_str ran successfully. | tee -a $logfile
-exit 0;
+exit 0
diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp
index c2c1237d..5186d9b9 100644
--- a/src/kernel/yask_main.cpp
+++ b/src/kernel/yask_main.cpp
@@ -53,13 +53,15 @@ struct AppSettings : public KernelSettings {
     // A custom option-handler for '-v'.
     class ValOption : public CommandLineParser::OptionBase {
         AppSettings& _as;
+        static constexpr idx_t _lsz=63, _bsz=24;
     public:
         ValOption(AppSettings& as) :
             OptionBase("v",
                        "Minimal validation: shortcut for '-validate -no-pre-auto_tune -no-auto_tune"
-                       " -no-warmup -t 1 -trial_steps 1 -d 63 -b 24'."),
+                       " -no-warmup -t 1 -trial_steps 1 -l " + to_string(_lsz) +
+                       " -b " + to_string(_bsz) + "'."),
             _as(as) { }
         // Set multiple vars.
@@ -73,8 +75,8 @@ struct AppSettings : public KernelSettings {
             _as.trial_steps = 1;
             for (auto dim : _as._dims->_domain_dims.getDims()) {
                 auto& dname = dim.getName();
-                _as._rank_sizes[dname] = 63;
-                _as._block_sizes[dname] = 24;
+                _as._rank_sizes[dname] = _lsz;
+                _as._block_sizes[dname] = _bsz;
             }
             return true;
         }
@@ -159,12 +161,12 @@ struct AppSettings : public KernelSettings {
         if (help) {
             string appNotes =
-                "Validation is very slow and uses 2x memory,\n"
+                "\nValidation is very slow and uses 2x memory,\n"
                 " so run with very small sizes and number of time-steps.\n"
                 " If validation fails, it may be due to rounding error;\n"
                 " try building with 8-byte reals.\n";
             vector<string> appExamples;
-            appExamples.push_back("-t 2");
+            appExamples.push_back("-g 768 -t 2");
             appExamples.push_back("-v");
             print_usage(cout, parser, argv[0], appNotes, appExamples);
             exit_yask(1);
diff --git a/utils/bin/analyze_trace.pl b/utils/bin/analyze_trace.pl
index a401d6d1..da426eb1 100755
--- a/utils/bin/analyze_trace.pl
+++ b/utils/bin/analyze_trace.pl
@@ -143,6 +143,6 @@ ($$$)
     print " ".(scalar @{$writes{$key}})." $key write(s) checked.\n";
 }
 print " $nissues issue(s) flagged.\n";
-print " (Ignore issues outside of rank domain when using temporal tiling and MPI.)\n"
+print " (Ignore issues outside of local domain when using temporal tiling and MPI.)\n"
     if $nissues;
 exit $nissues;
diff --git a/utils/bin/yask_tuner.pl b/utils/bin/yask_tuner.pl
index fff714b6..bb08e6e4 100755
--- a/utils/bin/yask_tuner.pl
+++ b/utils/bin/yask_tuner.pl
@@ -104,13 +104,13 @@ sub usage {
     "\nsearch-space options:\n".
     " -<gene>=<value>  Force <gene> to fixed value <value>.\n".
     "   Run with -check for list of genes and default ranges.\n".
-    "   Setting rank-domain size (d) also sets upper block and region sizes.\n".
+    "   Setting local-domain size (l) also sets upper block and region sizes.\n".
     "   Leave off 'x', 'y', 'z' suffix to set these 3 vars to same val.\n".
-    "   Examples: '-d=512' Set problem size to 512^3.\n".
+    "   Examples: '-l=512' Set local-domain size to 512^3.\n".
     "   '-bx=64' Set block size to 64 in 'x' dim.\n".
     "   '-ep=0' Disable extra padding.\n".
     "   '-c=1' Allow only one vector in a cluster.\n".
-    "   '-r=0' Allow only one OpenMP region (region size=0 => rank size).\n".
+    "   '-r=0' Allow only one OpenMP region (region size=0 => local-domain size).\n".
     " -<gene>=<min>-<max>  Restrict <gene> between <min> and <max>, inclusive.\n".
     "   Example: '-bx=8-128'.\n".
     "   See the notes above on <gene> specification.\n".
@@ -263,8 +263,8 @@ sub usage {
   usage("min value $min for '$key' > max value $max.")
     if ($min > $max);
-  # special case for problem size: also set default for other max sizes.
-  if ($key =~ /^d[xyz]?$/ && $max > 0) {
+  # special case for local-domain size: also set default for other max sizes.
+  if ($key =~ /^[ld][xyz]?$/ && $max > 0) {
     my @szs = qw(r b mb sb);
     push @szs, qw(bg mbg sbg) if $showGroups;
     for my $i (@szs) {
@@ -396,9 +396,9 @@ sub usage {
  my @rangesAll = (
   # rank size.
-  [ $minDim, $maxDim, 16, 'dx' ],
-  [ $minDim, $maxDim, 16, 'dy' ],
-  [ $minDim, $maxDim, 16, 'dz' ],
+  [ $minDim, $maxDim, 16, 'lx' ],
+  [ $minDim, $maxDim, 16, 'ly' ],
+  [ $minDim, $maxDim, 16, 'lz' ],
   # region size.
   [ 1, $maxTimeBlock, 1, 'rt' ],
@@ -1201,13 +1201,13 @@ sub fitness {
   if ($debugCheck) {
     print "Sizes:\n";
-    print " rank size = $dPts\n";
+    print " local-domain size = $dPts\n";
     print " region size = $rPts\n";
     print " block size = $bPts\n";
     print " sub-block size = $sbPts\n";
     print " cluster size = $cPts\n";
     print " fold size = $fPts\n";
-    print " regions per rank = $dRegs\n";
+    print " regions per local-domain = $dRegs\n";
     print " blocks per region = $rBlks\n";
     print " clusters per block = $bCls\n";
     print " mini-blocks per block = $bMbs\n";
@@ -1259,13 +1259,13 @@ sub fitness {
     $numChecks++;
     $checkStats{'ok'} += $ok;
     addStat($ok, 'mem estimate', $overallSize);
-    addStat($ok, 'rank size', $dPts);
+    addStat($ok, 'local-domain size', $dPts);
     addStat($ok, 'region size', $rPts);
     addStat($ok, 'block size', $bPts);
     addStat($ok, 'mini-block size', $mbPts);
     addStat($ok, 'sub-block size', $sbPts);
     addStat($ok, 'cluster size', $cPts);
-    addStat($ok, 'regions per rank', $dRegs);
+    addStat($ok, 'regions per local-domain', $dRegs);
     addStat($ok, 'blocks per region', $rBlks);
     addStat($ok, 'clusters per block', $bCls);
     addStat($ok, 'mini-blocks per block', $bMbs);
diff --git a/utils/lib/YaskUtils.pm b/utils/lib/YaskUtils.pm
index 36728dde..71ed28d6 100644
--- a/utils/lib/YaskUtils.pm
+++ b/utils/lib/YaskUtils.pm
@@ -61,15 +61,15 @@ our @log_keys =
     'num threads per region',
     'num threads per block',
     'total overall allocation',
-    'overall problem size',
-    'rank-domain size',
+    'global-domain size',
+    'local-domain size',
     'region size',
     'block size',
     'mini-block size',
     'sub-block size',
     'cluster size',
     'vector size',
-    'num regions per rank-domain per step',
+    'num regions per local-domain per step',
     'num blocks per region per step',
     'num mini-blocks per block per step',
     'num sub-blocks per mini-block per step',
@@ -171,7 +171,7 @@ sub getResultsFromLine($$) {
   chomp($line);
-  # pre-process keys.
+  # pre-process keys one time.
   if (scalar keys %proc_keys == 0) {
     undef %proc_keys;
     for my $m (@log_keys) {
@@ -193,6 +193,10 @@ sub getResultsFromLine($$) {
         $proc_keys{$sk}{$pm} = $m;
       }
     }
+
+    # Substitutions to handle old formats.
+    $line =~ s/overall.problem/global-domain/g;
+    $line =~ s/rank.domain/local-domain/g;
   # special cases for manual parsing...
   # TODO: catch output of auto-tuner and update relevant results.