From f81806357445a1d7751dd67ada044e1fd49fb498 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 8 May 2023 23:07:38 -0700 Subject: [PATCH 01/34] Squash --- .../toss_3_x86_64_ib/compilers.yaml | 65 +++---------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml index 3d9648a7..a4af3f37 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml @@ -1,11 +1,11 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@14.0.6 paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native @@ -15,12 +15,12 @@ compilers: environment: {} extra_rpaths: [] - compiler: - spec: clang@11.0.1 + spec: gcc@12.1.1 paths: - cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/g++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native @@ -29,48 +29,3 @@ compilers: modules: [] environment: {} extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.1.2 - paths: - cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native - cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] From 9ed565c0ea525894bfc71a47207dde1ce35478ad Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 26 Jan 2023 10:58:05 -0800 Subject: [PATCH 02/34] Fixed cmake so that GEOSX TPLs work and also updated spack. 
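
The functional core of this patch is in cmake/SetupTPL.cmake: when CAMP_DIR
and RAJA_DIR resolve to the same prefix, LvArray now reuses the CAMP bundled
inside the RAJA install instead of requiring a separate find_package(camp).
A host-config would opt in roughly as in the sketch below (illustrative only;
GEOSX_TPL_DIR is the variable already used by the LLNL host-configs, but the
"raja" subdirectory name here is a placeholder, not taken from this patch):

    set(RAJA_DIR "${GEOSX_TPL_DIR}/raja" CACHE PATH "")
    # Setting CAMP_DIR equal to RAJA_DIR takes the new
    # "LvArray using CAMP from RAJA." branch and skips find_package(camp).
    set(CAMP_DIR "${RAJA_DIR}" CACHE PATH "")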
--- cmake/SetupTPL.cmake | 51 +++---- host-configs/LLNL/lassen-base.cmake | 3 - host-configs/LLNL/quartz-base.cmake | 3 - scripts/uberenv/packages/lvarray/package.py | 26 ++-- scripts/uberenv/project.json | 4 +- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 132 +++++++++++++++--- .../toss_3_x86_64_ib_python/packages.yaml | 7 + 7 files changed, 161 insertions(+), 65 deletions(-) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index bff94834..29a60128 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,19 +1,22 @@ set(thirdPartyLibs "") -################################ +############################### # CAMP -################################ -if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") -endif() +############################### +if(CAMP_DIR STREQUAL RAJA_DIR) + message(STATUS "LvArray using CAMP from RAJA.") +else() + if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") + endif() -message(STATUS "Using CAMP from ${CAMP_DIR}") + message(STATUS "LvArray using CAMP from ${CAMP_DIR}") -find_package(camp REQUIRED PATHS ${CAMP_DIR}) + find_package(camp REQUIRED PATHS ${CAMP_DIR}) -set(ENABLE_CAMP ON CACHE BOOL "") + set(thirdPartyLibs ${thirdPartyLibs} camp) +endif() -set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -22,7 +25,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "Using RAJA from ${RAJA_DIR}") +message(STATUS "LvArray using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -39,14 +42,14 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "Using Umpire from ${UMPIRE_DIR}") + message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "Not using Umpire.") + message(STATUS "LvArray not using Umpire.") endif() ################################ @@ -65,32 +68,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "Using CHAI from ${CHAI_DIR}") + message(STATUS "LvArray using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - set_target_properties(chai - PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. 
+ # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + # set_target_properties(chai + # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "Not using CHAI.") + message(STATUS "LvArray not using CHAI.") endif() -################################ +############################### # CALIPER -################################ +############################### if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "Using caliper from ${CALIPER_DIR}") + message(STATUS "LvArray using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -102,7 +105,7 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "Not using caliper.") + message(STATUS "LvArray not using caliper.") endif() ################################ diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..31a8f048 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -11,9 +11,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - set(ENABLE_ADDR2LINE ON CACHE BOOL "") # Cuda options diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake index b7eb21df..ef4128a6 100644 --- a/host-configs/LLNL/quartz-base.cmake +++ b/host-configs/LLNL/quartz-base.cmake @@ -12,9 +12,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - # set(ENABLE_PYLVARRAY ON CACHE BOOL "") # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "") diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 9c4b47d9..cf9d5548 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,32 +56,36 @@ class Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') - + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') - depends_on('camp+cuda', when='+cuda') depends_on('raja') - depends_on('raja+cuda', when='+cuda') - # At the moment Umpire doesn't support shared when building with CUDA. 
depends_on('umpire', when='+umpire') - depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') - depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') - depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') + depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') + with when('+cuda'): + for sm_ in CudaPackage.cuda_arch_values: + depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + + phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -285,10 +289,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n\n".format("-" * 80)) if "+caliper" in spec: - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) else: @@ -297,6 +297,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write('#{0}\n'.format('-' * 80)) cfg.write('# Python\n') cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) @@ -306,6 +307,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: cfg.write(cmake_cache_option("ENABLE_DOCS", True)) sphinx_dir = spec['py-sphinx'].prefix diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 9822f975..703db7a4 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/corbett5/spack", - "spack_branch": "package/corbett/lvarray-update", + "spack_url": "https://github.com/spack/spack", + "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b1bf26cb..b8353dd0 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -2,30 +2,90 @@ compilers: - compiler: spec: clang@10.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ f77: 
/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: spec: clang@11.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 + paths: + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 modules: [] environment: {} extra_rpaths: [] @@ -37,25 +97,55 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: 
{} + extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: xl@16.1.1 + spec: intel@19.1.2 paths: - cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc - cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC - f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf - fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort flags: - cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 - cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native + cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 0c6b833b..a6fbda09 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,3 +107,10 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ + + ninja: + buildable: False + externals: + - spec: ninja@kitware + prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ + From 329d7f1fd8b79439adeaf7bf6e632b1f27be3bd2 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 8 May 2023 22:53:05 -0700 Subject: [PATCH 03/34] Squash --- Notes.txt | 1 + ...quartz-toss_3_x86_64_ib-clang@10.0.1.cmake | 93 +++++++++++++++++++ .../toss_3_x86_64_ib_python/packages.yaml | 2 +- 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 Notes.txt create mode 100644 new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake diff --git a/Notes.txt b/Notes.txt new file mode 100644 index 00000000..d07775c5 --- /dev/null +++ b/Notes.txt @@ -0,0 +1 @@ +./scripts/uberenv/uberenv.py --prefix=../uberenv-libs/ --spack-config-dir=./scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/ --spec="%clang@10.0.1 +umpire +chai +caliper +pylvarray +benchmarks +examples ^caliper ~adiak ~mpi ~libunwind ~libdw ~papi" \ No newline at end of file diff --git a/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake new file mode 100644 index 00000000..90ac014b --- /dev/null 
+++ b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake @@ -0,0 +1,93 @@ +################################################################################# +# Generated host-config - Edit at own risk! +################################################################################# +#-------------------------------------------------------------------------------- +# SYS_TYPE: toss_3_x86_64_ib +# Compiler Spec: clang@10.0.1 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#-------------------------------------------------------------------------------- + +set(BLT_SOURCE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/blt-0.5.2-6nztad6saell6ikor6wtxp6qycxtfwh4" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Compilers +#-------------------------------------------------------------------------------- + +set(CMAKE_C_COMPILER "/usr/tce/bin/clang-10.0.1" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/bin/clang++-10.0.1" CACHE PATH "") + +set(CMAKE_C_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "") + +set(CMAKE_CXX_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(ENABLE_CUDA OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# CAMP +#-------------------------------------------------------------------------------- + +set(CAMP_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/camp-2022.03.2-2q75xbq2h4ykcyvasoqg55torawlabkw" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# RAJA +#-------------------------------------------------------------------------------- + +set(RAJA_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/raja-2022.03.0-jkp4hp7ifyxkxzkbho5ngdnk4x3opaoy" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Umpire +#-------------------------------------------------------------------------------- + +set(ENABLE_UMPIRE ON CACHE BOOL "") + +set(UMPIRE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/umpire-2022.03.1-aerit7injc3hmn2ripnsxtnlwxicjmuu" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# CHAI +#-------------------------------------------------------------------------------- + +set(ENABLE_CHAI ON CACHE BOOL "") + +set(CHAI_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/chai-2022.03.0-s6w2gsrreu7krgzboekmlukmfestpg7k" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Caliper +#-------------------------------------------------------------------------------- + +#-------------------------------------------------------------------------------- +# Caliper +#-------------------------------------------------------------------------------- + +set(ENABLE_CALIPER ON CACHE BOOL "") + +set(CALIPER_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/caliper-2.8.0-3fwkrbu4bhnc4bqvhrqcydrzxslq6ryz" CACHE PATH "") + 
+#-------------------------------------------------------------------------------- +# Python +#-------------------------------------------------------------------------------- + +set(ENABLE_PYLVARRAY OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Documentation +#-------------------------------------------------------------------------------- + +set(ENABLE_DOCS OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# addr2line +#-------------------------------------------------------------------------------- + +set(ENABLE_ADDR2LINE ON CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Other +#-------------------------------------------------------------------------------- + diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index a6fbda09..5b3c9fbe 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -112,5 +112,5 @@ packages: buildable: False externals: - spec: ninja@kitware - prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ + prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ninja From d31c0ffdcdc83d875336d50550c546f1e37d12be Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 9 May 2023 00:54:59 -0700 Subject: [PATCH 04/34] Got Spack working on Quartz again, need to do Lassen and figure out python stuff. --- host-configs/LLNL/lassen-base.cmake | 3 +++ host-configs/LLNL/quartz-base.cmake | 3 +++ scripts/uberenv/packages/lvarray/package.py | 13 +++++++++++-- .../toss_3_x86_64_ib_python/packages.yaml | 4 ++-- .../compilers.yaml | 4 ++-- .../packages.yaml | 0 6 files changed, 21 insertions(+), 6 deletions(-) rename scripts/uberenv/spack_configs/{toss_3_x86_64_ib => toss_4_x86_64_ib}/compilers.yaml (94%) rename scripts/uberenv/spack_configs/{toss_3_x86_64_ib => toss_4_x86_64_ib}/packages.yaml (100%) diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 31a8f048..5a443bb9 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -11,6 +11,9 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") +set(ENABLE_CALIPER ON CACHE BOOL "") +set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") + set(ENABLE_ADDR2LINE ON CACHE BOOL "") # Cuda options diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake index ef4128a6..b7eb21df 100644 --- a/host-configs/LLNL/quartz-base.cmake +++ b/host-configs/LLNL/quartz-base.cmake @@ -12,6 +12,9 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") +set(ENABLE_CALIPER ON CACHE BOOL "") +set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") + # set(ENABLE_PYLVARRAY ON CACHE BOOL "") # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "") diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index cf9d5548..b377bdfa 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,6 +56,8 @@ class 
Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') + variant('tpl_build_type', default='none', description='TPL build type', + values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) depends_on('blt', when='@0.2.0:', type='build') @@ -82,9 +84,16 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='+chai cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='+caliper cuda_arch={0}'.format(sm_)) + for bt in ('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel'): + with when('tpl_build_type={}'.format(bt)): + depends_on('camp build_type={}'.format(bt)) + depends_on('raja build_type={}'.format(bt)) + depends_on('umpire build_type={}'.format(bt)) + depends_on('chai build_type={}'.format(bt), when='+chai') + depends_on('caliper build_type={}'.format(bt), when='+caliper') phases = ['hostconfig', 'cmake', 'build', 'install'] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 5b3c9fbe..43971e78 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -111,6 +111,6 @@ packages: ninja: buildable: False externals: - - spec: ninja@kitware - prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ninja + - spec: ninja@1.11.0 + prefix: /usr/tce/packages/ninja/ninja-1.11.0 diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml similarity index 94% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml index a4af3f37..15bdbccd 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml @@ -9,7 +9,7 @@ compilers: flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native - operating_system: rhel7 + operating_system: rhel8 target: x86_64 modules: [] environment: {} @@ -24,7 +24,7 @@ compilers: flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native - operating_system: rhel7 + operating_system: rhel8 target: x86_64 modules: [] environment: {} diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml similarity index 100% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml From e09e98707edb6e6f6387fc29e68f630d2680c511 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 22 Feb 2022 18:15:46 -0500 Subject: [PATCH 05/34] small umpire and raja versioning and rocm changes --- CMakeLists.txt | 3 ++ cmake/blt | 2 +- .../{ => 
ORNL}/ascent-gcc@8.1.1.cmake | 0 host-configs/ORNL/crusher-cce@13.0.1.cmake | 41 +++++++++++++++++++ host-configs/ORNL/spock-cce@12.0.3.cmake | 39 ++++++++++++++++++ src/CMakeLists.txt | 4 +- src/bufferManipulation.hpp | 15 +++++++ unitTests/testUtils.hpp | 1 + 8 files changed, 102 insertions(+), 3 deletions(-) rename host-configs/{ => ORNL}/ascent-gcc@8.1.1.cmake (100%) create mode 100644 host-configs/ORNL/crusher-cce@13.0.1.cmake create mode 100644 host-configs/ORNL/spock-cce@12.0.3.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index fc8cf73d..e53d193d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ if( NOT is_submodule ) option( ENABLE_ADDR2LINE "Enable addr2line usage in stacktraces" ON ) option( ENABLE_CUDA "Build with CUDA" OFF ) + option( ENABLE_HIP "Build with HIP" OFF ) option( ENABLE_UMPIRE "Build with UMPIRE" OFF ) option( ENABLE_CHAI "Build with CHAI" OFF ) option( ENABLE_CALIPER "Build with Caliper" OFF ) @@ -80,6 +81,8 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) +blt_list_append( TO lvarray_dependencies ELEMENTS hip hip_runtime IF ENABLE_HIP ) + blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/cmake/blt b/cmake/blt index c253509a..ddd5a0ca 160000 --- a/cmake/blt +++ b/cmake/blt @@ -1 +1 @@ -Subproject commit c253509ab2daf759eb857958597f6f34ab8c1713 +Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb diff --git a/host-configs/ascent-gcc@8.1.1.cmake b/host-configs/ORNL/ascent-gcc@8.1.1.cmake similarity index 100% rename from host-configs/ascent-gcc@8.1.1.cmake rename to host-configs/ORNL/ascent-gcc@8.1.1.cmake diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake new file mode 100644 index 00000000..2a359fd5 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -0,0 +1,41 @@ +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) + +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) + +# C++ options +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) +set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) +set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) +set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) +#set( 
CMAKE_CXX_FLAGS "-std=c++14 -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) + +set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) +set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) +set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") diff --git a/host-configs/ORNL/spock-cce@12.0.3.cmake b/host-configs/ORNL/spock-cce@12.0.3.cmake new file mode 100644 index 00000000..f0764c32 --- /dev/null +++ b/host-configs/ORNL/spock-cce@12.0.3.cmake @@ -0,0 +1,39 @@ +set(CONFIG_NAME "spock-cce@12.0.3" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen2/cce-12.0.3" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-qpmhf6p7n5sarmks55hgjnzff3ncs7jd/" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-frggdmwjevbxy4a6kw7ctgrhyv7erfhr/" CACHE PATH "" ) + +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-nkdetdg5tjyzzf5yjzo32jxwkmwfjjqn/" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-wun25mr5qf7vo6x2vblhzh2ivs7vr4g6/" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-a5ponjo23u7smy7w4a4jj7im47shrsxk/" CACHE PATH "" ) + +set(METIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/metis-5.1.0-rbblqiymq6eoursordyaq2ghimzpd22v/" CACHE PATH "" ) +set(PARMETIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/parmetis-4.0.3-mliemgo6vxrahsz4f6u5agdqyfpk2yd2/" CACHE PATH "" ) + +# C++ options +#set(CMAKE_C_COMPILER "/opt/cray/pe/cce/12.0.3/bin/craycc" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayCC" CACHE PATH "") +#set(CMAKE_Fortran_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayftn" CACHE PATH "") + +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.11/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.11/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.11/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI OFF CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( HIP_ROOT "/opt/rocm-4.2.0" CACHE PATH "" ) +set( HIP_VERSION_STRING "4.2.0" CACHE STRING "" ) +set( CMAKE_HIP_ARCHITECTURES "gfx908" CACHE STRING "" FORCE ) + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 03f627c2..da7c512f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,7 +38,7 @@ set( lvarray_headers sortedArrayManipulationHelpers.hpp system.hpp tensorOps.hpp - totalview/tv_data_display.h +# totalview/tv_data_display.h typeManipulation.hpp umpireInterface.hpp ) @@ -46,7 +46,7 @@ blt_list_append( TO lvarray_headers ELEMENTS ChaiBuffer.hpp IF ENABLE_CHAI ) set( lvarray_sources system.cpp - totalview/tv_data_display.c +# totalview/tv_data_display.c umpireInterface.cpp ) blt_add_library( NAME lvarray diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 62b94539..b06a4e4c 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -69,6 +69,21 @@ namespace bufferManipulation */ HAS_MEMBER_FUNCTION_NO_RTYPE( move, 
MemorySpace::host, true ); + +template < typename T > +struct ContainerShim +{ + ContainerShim( T * begin, T * end ) + : m_begin( begin ) + , m_end( end ) + {} + T * begin() const { return m_begin; } + T * end() const { return m_end; } + T * m_begin; + T * m_end; +}; + + /** * @class VoidBuffer * @brief This class implements the default behavior for the Buffer methods related diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 5a2db2bf..161c8453 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -20,6 +20,7 @@ // TPL includes #include +#include #include // System includes From f5a81fee1c4302c53c9821ea8fba4c9c663de083 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Mon, 21 Mar 2022 16:04:07 -0400 Subject: [PATCH 06/34] ongoing crusher/rocm work --- cmake/Config.cmake | 1 + host-configs/ORNL/crusher-cce@13.0.1.cmake | 21 +- src/ArrayOfArraysView.hpp | 3 + src/ChaiBuffer.hpp | 18 +- src/LvArrayConfig.hpp.in | 2 + src/Macros.hpp | 6 +- src/SortedArrayView.hpp | 3 + unitTests/testArray1DOfArray1D.cpp | 2 +- unitTests/testArray1DOfArray1DOfArray1D.cpp | 2 +- unitTests/testArrayOfArrays.cpp | 4 +- unitTests/testArrayOfSets.cpp | 2 +- unitTests/testArray_ChaiBuffer.cpp | 42 +- unitTests/testCRSMatrix.cpp | 4 +- unitTests/testChaiBuffer.cpp | 132 ++- unitTests/testMath.cpp | 10 +- unitTests/testMemcpy.cpp | 101 ++- unitTests/testSortedArray.cpp | 2 +- unitTests/testSortedArrayManipulation.cpp | 4 +- unitTests/testSparsityPattern.cpp | 4 +- unitTests/testStackArray.cpp | 2 +- unitTests/testTensorOpsEigen.cpp | 2 +- unitTests/testTensorOpsFixedSize.cpp | 2 +- unitTests/testTensorOpsInverse.hpp | 4 +- unitTests/testTensorOpsNoSize.cpp | 2 +- unitTests/testTensorOpsOneSize.cpp | 2 +- unitTests/testTensorOpsThreeSizes.hpp | 2 +- unitTests/testTensorOpsTwoSizes.hpp | 2 +- unitTests/testTensorOpsTwoSizes1.cpp | 930 +++++++++++++++++++- unitTests/testTypeManipulation.cpp | 17 + unitTests/testUtils.hpp | 23 +- 30 files changed, 1305 insertions(+), 46 deletions(-) diff --git a/cmake/Config.cmake b/cmake/Config.cmake index 0a44fd1b..cf8ff35b 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,6 +2,7 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA + HIP TOTALVIEW_OUTPUT CALIPER ) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 2a359fd5..65830097 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -7,8 +7,10 @@ set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(ENABLE_CHAI TRUE CACHE BOOL "" ) set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) @@ -28,14 +30,27 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) set( ENABLE_HIP ON CACHE BOOL "" FORCE ) set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) -#set( CMAKE_CXX_FLAGS 
"-std=c++14 -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) + +#set( CMAKE_CXX_FLAGS "--offload-arch=gfx90a -x hip -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) -set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) +set( HIP_HIPCC_FLAGS "-std=c++14 --amdgpu-target=gfx90a" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) + +set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") set(gtest_disable_pthreads ON CACHE BOOL "") + +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +#set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +#BLT +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 706f2014..52c8df15 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -587,6 +587,9 @@ class ArrayOfArraysView #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_offsets.move( space, touch ); } diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index b5d26fa1..6b0d45ec 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -56,7 +56,11 @@ inline chai::ExecutionSpace toChaiExecutionSpace( MemorySpace const space ) if( space == MemorySpace::host ) return chai::CPU; #if defined(LVARRAY_USE_CUDA) - if( space == MemorySpace::cuda || space == MemorySpace::hip ) + if( space == MemorySpace::cuda ) + return chai::GPU; +#endif +#if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) return chai::GPU; #endif @@ -79,6 +83,10 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::GPU ) return MemorySpace::cuda; #endif +#if defined(LVARRAY_USE_HIP) + if( space == chai::GPU ) + return MemorySpace::hip; +#endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -185,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -203,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); @@ -370,7 +378,7 @@ class ChaiBuffer inline void moveNested( MemorySpace const space, std::ptrdiff_t const size, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if 
defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || @@ -398,7 +406,7 @@ class ChaiBuffer */ void move( MemorySpace const space, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index 2c997ab5..dcbd30b3 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -26,6 +26,8 @@ #cmakedefine LVARRAY_USE_CUDA +#cmakedefine LVARRAY_USE_HIP + #cmakedefine LVARRAY_USE_TOTALVIEW_OUTPUT #cmakedefine LVARRAY_USE_CALIPER diff --git a/src/Macros.hpp b/src/Macros.hpp index 544f5e19..a2060c1a 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,7 +22,7 @@ #include #include -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) #include #endif @@ -91,7 +91,7 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -535,7 +535,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) +#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIPCC__) ) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ diff --git a/src/SortedArrayView.hpp b/src/SortedArrayView.hpp index ab7ca790..8559a3fc 100644 --- a/src/SortedArrayView.hpp +++ b/src/SortedArrayView.hpp @@ -274,6 +274,9 @@ class SortedArrayView { #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_values.move( space, touch ); } diff --git a/unitTests/testArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1D.cpp index faa53b52..7ff271cb 100644 --- a/unitTests/testArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1D.cpp @@ -233,7 +233,7 @@ using Array1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp index 5dc93fe8..cdd17fe2 100644 --- a/unitTests/testArray1DOfArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp @@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 
> > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArrayOfArrays.cpp b/unitTests/testArrayOfArrays.cpp index 784fd448..aa20086b 100644 --- a/unitTests/testArrayOfArrays.cpp +++ b/unitTests/testArrayOfArrays.cpp @@ -1284,7 +1284,7 @@ using ArrayOfArraysViewTestTypes = ::testing::Types< , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1467,7 +1467,7 @@ using ArrayOfArraysViewAtomicTestTypes = ::testing::Types< , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArrayOfSets.cpp b/unitTests/testArrayOfSets.cpp index d3b9f540..ac71a76b 100644 --- a/unitTests/testArrayOfSets.cpp +++ b/unitTests/testArrayOfSets.cpp @@ -925,7 +925,7 @@ using ArrayOfSetsViewTestTypes = ::testing::Types< , std::pair< ArrayOfSets< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfSets< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp index 34825981..5ef2a6a1 100644 --- a/unitTests/testArray_ChaiBuffer.cpp +++ b/unitTests/testArray_ChaiBuffer.cpp @@ -42,6 +42,10 @@ class ArrayTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined(LVARRAY_USE_HIP) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -61,13 +65,19 @@ class ArrayTest : public ::testing::Test array.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" ); + array.move( MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + array.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" ); + array.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( array.data() 
).getName(), "HOST_pool" ); #endif } #if defined( LVARRAY_USE_CUDA ) - void testDeviceAlloc() + void testCudaDeviceAlloc() { Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array; @@ -86,6 +96,26 @@ class ArrayTest : public ::testing::Test } } #endif +#if defined(LVARRAY_USE_HIP) + void testHIPDeviceAlloc() + { + Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array; + + array.resizeWithoutInitializationOrDestruction( MemorySpace::hip, 100 ); + + T * const devPtr = array.data(); + forall< parallelDevicePolicy< 32 > >( array.size(), [devPtr] LVARRAY_DEVICE ( int const i ) + { + new ( &devPtr[ i ] ) T( i ); + } ); + + array.move( MemorySpace::host, true ); + for( int i = 0; i < array.size(); ++i ) + { + EXPECT_EQ( array[ i ], T( i ) ); + } + } +#endif }; /// The list of types to instantiate ArrayTest with. @@ -104,9 +134,17 @@ TYPED_TEST( ArrayTest, AllocatorConstruction ) TYPED_TEST( ArrayTest, DeviceAlloc ) { - this->testDeviceAlloc(); + this->testCudaDeviceAlloc(); } +#endif +#if defined(LVARRAY_USE_HIP) + +TYPED_TEST( ArrayTest, DeviceAlloc ) +{ + this->testHIPDeviceAlloc(); +} + #endif } // namespace testing diff --git a/unitTests/testCRSMatrix.cpp b/unitTests/testCRSMatrix.cpp index 987aa4e9..3c6c0556 100644 --- a/unitTests/testCRSMatrix.cpp +++ b/unitTests/testCRSMatrix.cpp @@ -1036,7 +1036,7 @@ using CRSMatrixViewTestTypes = ::testing::Types< , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1276,7 +1276,7 @@ using CRSMatrixViewAtomicTestTypes = ::testing::Types< , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index 8c6d9937..ae12886f 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -41,6 +41,10 @@ class ChaiBufferTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined( LVARRAY_USE_HIP ) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -62,6 +66,12 @@ class ChaiBufferTest : public ::testing::Test buffer.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( 
MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); #endif @@ -188,6 +198,126 @@ class ChaiBufferTest : public ::testing::Test EXPECT_EQ( buffer[ i ], T( i ) ); } + bufferManipulation::free( buffer, size ); + } +#elif defined( LVARRAY_USE_HIP ) + void testMove() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + T * const devPtr = buffer.data(); + + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + // Check that the device changes are seen on the host. Then modify the values without touching. + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) ); + buffer[ i ] = T( 0 ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } + + bufferManipulation::free( buffer, size ); + } + + void testCapture() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + + // Check that the device changes are seen on the host. Then modify the values without touching. 
+    ChaiBuffer< T const > constBuffer( buffer );
+    forall< serialPolicy >( size, [constBuffer] ( int const i )
+    {
+      EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) );
+      const_cast< T & >( constBuffer[ i ] ) = T( 0 );
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host );
+    EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host );
+
+    forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i )
+    {
+      buffer[ i ] += buffer[ i ];
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip );
+
+    forall< serialPolicy >( size, [constBuffer] ( int const i )
+    {
+      EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) );
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host );
+    EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host );
+
+    bufferManipulation::free( buffer, size );
+  }
+
+  void testDeviceRealloc()
+  {
+    ChaiBuffer< T > buffer( true );
+
+    int const size = 100;
+    buffer.reallocate( 0, MemorySpace::hip, size );
+
+    T * const devPtr = buffer.data();
+    forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i )
+    {
+      new ( &devPtr[ i ] ) T( i );
+    } );
+
+    buffer.move( MemorySpace::host, true );
+    for( int i = 0; i < size; ++i )
+    {
+      EXPECT_EQ( buffer[ i ], T( i ) );
+    }
+
+    bufferManipulation::free( buffer, size );
+  }
 #endif
@@ -205,7 +335,7 @@ TYPED_TEST( ChaiBufferTest, AllocatorConstruction )
   this->testAllocatorConstruction();
 }
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 TYPED_TEST( ChaiBufferTest, Move )
 {
diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp
index 08502c4f..d7c76b19 100644
--- a/unitTests/testMath.cpp
+++ b/unitTests/testMath.cpp
@@ -145,14 +145,16 @@ using TestMathTypes = ::testing::Types<
   , std::pair< long long int, serialPolicy >
   , std::pair< float, serialPolicy >
   , std::pair< double, serialPolicy >
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
   , std::pair< int, parallelDevicePolicy< 32 > >
   , std::pair< long int, parallelDevicePolicy< 32 > >
   , std::pair< long long int, parallelDevicePolicy< 32 > >
   , std::pair< float, parallelDevicePolicy< 32 > >
   , std::pair< double, parallelDevicePolicy< 32 > >
-  , std::pair< __half, parallelDevicePolicy< 32 > >
 #endif
+#if defined( LVARRAY_USE_CUDA )
+  , std::pair< __half, parallelDevicePolicy< 32 > >
+#endif
   >;
 
 TYPED_TEST_SUITE( TestMath, TestMathTypes, );
@@ -331,7 +332,7 @@ struct TestMath2 : public ::testing::Test
   }
 };
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 using TestMath2Types = ::testing::Types<
   std::pair< __half2, parallelDevicePolicy< 32 > >
@@ -403,7 +404,8 @@ void forAllHalvesinMinus1to1( bool const include1, LAMBDA && lambda )
     }
   } );
 }
-
+#endif
+#if defined(LVARRAY_USE_CUDA)
 void asinHalfAccuracy()
 {
   RAJA::ReduceMax< RAJA::cuda_reduce, double > maxDiff( 0 );
diff --git a/unitTests/testMemcpy.cpp b/unitTests/testMemcpy.cpp
index f3adcece..0e44243d 100644
--- a/unitTests/testMemcpy.cpp
+++ b/unitTests/testMemcpy.cpp
@@ -242,7 +242,106 @@ void testAsyncMemcpyDevice()
     EXPECT_EQ( x[ i ], -i );
   }
 }
+#elif defined(LVARRAY_USE_HIP)
+template< template< typename > class BUFFER_TYPE >
+void testMemcpyDevice()
+{
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = 
y.data(); + + memcpy< 0, 0 >( y, {}, x.toViewConst(), {} ); + + forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + PORTABLE_EXPECT_EQ( yPtr[ i ], i ); + yPtr[ i ] *= 2; + } ); + + memcpy< 0, 0 >( x, {}, y.toViewConst(), {} ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], 2 * i ); + } + + // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing + // to host memory but the subsequent memcpy should pick up that it's previous space is on device. + y.move( MemorySpace::host ); + + ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView(); + forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + yView[ i ] = -i; + } ); + + memcpy< 0, 0 >( x, {}, y.toViewConst(), {} ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], -i ); + } +} + +template< template< typename > class BUFFER_TYPE > +void testAsyncMemcpyDevice() +{ + camp::resources::Resource stream{ camp::resources::Hip{} }; + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + x[ i ] = i; + } + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() ); + y.move( MemorySpace::hip ); + int * yPtr = y.data(); + + camp::resources::Event e = memcpy< 0, 0 >( stream, y.toView(), {}, x.toViewConst(), {} ); + stream.wait_for( &e ); + + forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + PORTABLE_EXPECT_EQ( yPtr[ i ], i ); + yPtr[ i ] *= 2; + } ); + + e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} ); + stream.wait_for( &e ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], 2 * i ); + } + + // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing + // to host memory but the subsequent memcpy should pick up that it's previous space is on device. 
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
 #endif
 
 TEST( TestMemcpy, MallocBuffer1D )
 {
@@ -282,7 +381,7 @@ TEST( TestMemcpy, ChaiBuffer2D )
   testMemcpy2D< ChaiBuffer >();
 }
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 TEST( TestMemcpy, ChaiBufferDevice )
 {
diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp
index 5198bd24..ae145fbd 100644
--- a/unitTests/testSortedArray.cpp
+++ b/unitTests/testSortedArray.cpp
@@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types<
   std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
   , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
   , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI)
   , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
   , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testSortedArrayManipulation.cpp b/unitTests/testSortedArrayManipulation.cpp
index 2d784cb2..ae376cb4 100644
--- a/unitTests/testSortedArrayManipulation.cpp
+++ b/unitTests/testSortedArrayManipulation.cpp
@@ -190,7 +190,7 @@ using SingleArrayTestTypes = ::testing::Types<
   , std::tuple< TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
   , std::tuple< TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
   , std::tuple< int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
   , std::tuple< Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
@@ -290,7 +290,7 @@ using DualArrayTestTypes = ::testing::Types<
   , std::tuple< TestString, TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
   , std::tuple< TestString, TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< int, int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
   , std::tuple< int, int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
   , std::tuple< Tensor, Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
diff --git a/unitTests/testSparsityPattern.cpp b/unitTests/testSparsityPattern.cpp
index 50ec30f9..fee7a995 100644
--- a/unitTests/testSparsityPattern.cpp
+++ b/unitTests/testSparsityPattern.cpp
@@ -1016,7 +1016,7 @@ using SparsityPatternViewTestTypes = ::testing::Types<
 #endif
 #endif
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::pair< 
SparsityPattern< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #if !defined( __ibmxl__ ) , std::pair< SparsityPattern< uint, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > @@ -1171,7 +1171,7 @@ using CRSMatrixTestTypes = ::testing::Types< std::pair< CRSMatrix< int, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testStackArray.cpp b/unitTests/testStackArray.cpp index 249ccebb..e29206ab 100644 --- a/unitTests/testStackArray.cpp +++ b/unitTests/testStackArray.cpp @@ -281,7 +281,7 @@ using StackArrayCaptureTestTypes = ::testing::Types< , std::pair< RAJA::PERM_KIJ, serialPolicy > , std::pair< RAJA::PERM_KJI, serialPolicy > -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) , std::pair< RAJA::PERM_I, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_IJ, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_JI, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsEigen.cpp b/unitTests/testTensorOpsEigen.cpp index 46ff354d..2c556ec7 100644 --- a/unitTests/testTensorOpsEigen.cpp +++ b/unitTests/testTensorOpsEigen.cpp @@ -243,7 +243,7 @@ using TestEigendecompositionTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< std::int64_t, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< std::int64_t, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index e66fd5a3..c4ba14cb 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -569,7 +569,7 @@ using FixedSizeSquareMatrixTestTypes = ::testing::Types< std::tuple< double, std::integral_constant< int, 2 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 4909a686..9edfa950 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -375,7 +375,7 @@ using InverseTestTypes = ::testing::Types< , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< int, double, std::integral_constant< int, 3 
>, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > @@ -400,7 +400,7 @@ using InverseFloatOnlyTestTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsNoSize.cpp b/unitTests/testTensorOpsNoSize.cpp index b08e5ae1..8c1112d4 100644 --- a/unitTests/testTensorOpsNoSize.cpp +++ b/unitTests/testTensorOpsNoSize.cpp @@ -349,7 +349,7 @@ using NoSizeTestTypes = ::testing::Types< std::tuple< double, serialPolicy > , std::tuple< int, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, parallelDevicePolicy< 32 > > , std::tuple< int, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsOneSize.cpp b/unitTests/testTensorOpsOneSize.cpp index fc351c75..78946638 100644 --- a/unitTests/testTensorOpsOneSize.cpp +++ b/unitTests/testTensorOpsOneSize.cpp @@ -693,7 +693,7 @@ using OneSizeTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 6 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 6 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsThreeSizes.hpp b/unitTests/testTensorOpsThreeSizes.hpp index 5a27092a..b4546a9b 100644 --- a/unitTests/testTensorOpsThreeSizes.hpp +++ b/unitTests/testTensorOpsThreeSizes.hpp @@ -530,7 +530,7 @@ using ThreeSizesTestTypes = ::testing::Types< std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, diff --git a/unitTests/testTensorOpsTwoSizes.hpp b/unitTests/testTensorOpsTwoSizes.hpp index 07978011..5492b2b5 100644 --- a/unitTests/testTensorOpsTwoSizes.hpp +++ b/unitTests/testTensorOpsTwoSizes.hpp @@ -930,7 +930,7 @@ using TwoSizesTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy > -#if 
defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
   , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
   , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp
index 7f5a97d5..101c4671 100644
--- a/unitTests/testTensorOpsTwoSizes1.cpp
+++ b/unitTests/testTensorOpsTwoSizes1.cpp
@@ -6,13 +6,941 @@
  */
 
 // Source includes
-#include "testTensorOpsTwoSizes.hpp"
+//#include "testTensorOpsTwoSizes.hpp"
+
+// Source includes
+#include "tensorOps.hpp"
+#include "Array.hpp"
+#include "testUtils.hpp"
+#include "output.hpp"
+#include "testTensorOpsCommon.hpp"
+
+// TPL includes
+#include <gtest/gtest.h>
 
 namespace LvArray
 {
 namespace testing
 {
 
+template< typename T_N_M_POLICY_TUPLE >
+class TwoSizesTest : public ::testing::Test
+{
+public:
+  using T = std::tuple_element_t< 0, T_N_M_POLICY_TUPLE >;
+  static constexpr std::ptrdiff_t N = std::tuple_element_t< 1, T_N_M_POLICY_TUPLE > {};
+  static constexpr std::ptrdiff_t M = std::tuple_element_t< 2, T_N_M_POLICY_TUPLE > {};
+  using POLICY = std::tuple_element_t< 3, T_N_M_POLICY_TUPLE >;
+
+  void SetUp() override
+  {
+    fill( m_matrixA_IJK.toSlice(), m_matrixASeed );
+    fill( m_matrixA_IKJ.toSlice(), m_matrixASeed );
+    fill( m_matrixA_KJI.toSlice(), m_matrixASeed );
+    fill( m_matrixA_local, m_matrixASeed );
+
+    fill( m_matrixB_IJK.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_IKJ.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_KJI.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_local, m_matrixBSeed );
+
+    fill( m_matrixNN_IJK.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_IKJ.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_KJI.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_local, m_matrixNNSeed );
+
+    fill( m_matrixMN_IJK.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_IKJ.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_KJI.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_local, m_matrixMNSeed );
+
+    fill( m_vectorN_IJ.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_JI.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_local, m_vectorNSeed );
+
+    fill( m_vectorM_IJ.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_JI.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_local, m_vectorMSeed );
+  }
+
+  void testScale()
+  {
+    T scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] * scale;
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    std::ptrdiff_t const aSeed = m_matrixASeed;
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, aSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      tensorOps::scale< N, M >( matrixA_IJK[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IJK[ 0 ], result );
+
+      tensorOps::scale< N, M >( matrixA_IKJ[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IKJ[ 0 ], result );
+
+      tensorOps::scale< N, M >( matrixA_KJI[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_KJI[ 0 ], result );
+
+      T matrix_local[ N ][ M ];
+      fill( matrix_local, aSeed );
+      
tensorOps::scale< N, M >( matrix_local, scale ); + CHECK_EQUALITY_2D( N, M, matrix_local, result ); + } ); + } + + void testFill() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI] LVARRAY_HOST_DEVICE ( int ) + { + for( int i = 0; i < 3; ++i ) + { + T const value = 3.14 * i; + tensorOps::fill< N, M >( matrixA_IJK[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IJK( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_IKJ[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IKJ( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_KJI[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_KJI( 0, j, k ), value ); + } + } + + T matrix_local[ N ][ M ]; + tensorOps::fill< N, M >( matrix_local, value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrix_local[ j ][ k ], value ); + } + } + } + } ); + } + + void testAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, + vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_eq_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], 
vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_add_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } 
+ result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_eq_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = m_vectorN_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, 
matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_add_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_eq_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorM_local[ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + 
_TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = m_vectorM_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_add_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorM_local[ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], 
vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testCopy() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::copy< N, M >( dstMatrix, srcMatrix ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, srcMatrix ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testScaledCopy() + { + T scale = T( 3.14 ); + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = scale * m_matrixB_local[ i ][ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = 
m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::scaledCopy< N, M >( dstMatrix, srcMatrix, scale ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAdd() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_matrixB_local[ i ][ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::add< N, M >( dstMatrix, srcMatrix ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, 
srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testScaledAdd() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toView(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toView(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toView(); + + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed ] + LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::scaledAdd< N, M >( dstMatrix, srcMatrix, scale ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ); \ + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrixA_local[ N ][ M ]; + fill( matrixA_local, matrixSeed ); + + T const scale = T( 3.14 ); + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = matrixA_local[ i ][ j ] + scale * matrixB_local[ i ][ j ]; + } + } + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], 
matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAkiAkj() + { + T result[ N ][ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + T dot = 0; + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + dot += m_matrixMN_local[ k ][ i ] * m_matrixMN_local[ k ][ j ]; + } + result[ i ][ j ] = dot; + } + } + + ArrayViewT< T const, 3, 2 > const matrixMN_IJK = m_matrixMN_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixMN_IKJ = m_matrixMN_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixMN_KJI = m_matrixMN_KJI.toViewConst(); + T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local; + + ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView(); + + std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed; + + forall< POLICY >( 1, + [result, matrixMN_IJK, matrixMN_IKJ, matrixMN_KJI, matrixMN_local, matrixNN_IJK, + matrixNN_IKJ, matrixNN_KJI, matrixNNSeed ] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrixNN, matrixMN ) \ + fill( matrixNN, matrixNNSeed ); \ + tensorOps::Rij_eq_AkiAkj< N, M >( matrixNN, matrixMN ); \ + CHECK_EQUALITY_2D( N, N, matrixNN, result ) + + #define _TEST_PERMS( matrixNN, matrixMN0, matrixMN1, matrixMN2, matrixMN3 ) \ + _TEST( matrixNN, matrixMN0 ); \ + _TEST( matrixNN, matrixMN1 ); \ + _TEST( matrixNN, matrixMN2 ); \ + _TEST( matrixNN, matrixMN3 ) + + T matrixNN_local[ N ][ N ]; + + _TEST_PERMS( matrixNN_IJK[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_KJI[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_local, matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAikAjk() + { + T result[ N ][ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + T dot = 0; + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + dot += m_matrixA_local[ i ][ k ] * m_matrixA_local[ j ][ k ]; + } + result[ i ][ j ] = m_matrixNN_local[ i ][ j ] + dot; + } + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + 
ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrixA_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView(); + + std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixA_local, matrixNN_IJK, + matrixNN_IKJ, matrixNN_KJI, matrixNNSeed ] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrixNN, matrixA ) \ + fill( matrixNN, matrixNNSeed ); \ + tensorOps::Rij_add_AikAjk< N, M >( matrixNN, matrixA ); \ + CHECK_EQUALITY_2D( N, N, matrixNN, result ) + + #define _TEST_PERMS( matrixNN, matrixA0, matrixA1, matrixA2, matrixA3 ) \ + _TEST( matrixNN, matrixA0 ); \ + _TEST( matrixNN, matrixA1 ); \ + _TEST( matrixNN, matrixA2 ); \ + _TEST( matrixNN, matrixA3 ) + + T matrixNN_local[ N ][ N ]; + + _TEST_PERMS( matrixNN_IJK[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_KJI[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_local, matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testTranspose() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixMN_IJK_view = m_matrixMN_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixMN_IKJ_view = m_matrixMN_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixMN_KJI_view = m_matrixMN_KJI.toViewConst(); + T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [=] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::transpose< N, M >( dstMatrix, srcMatrix ); \ + for( int i = 0; i < N; ++i ) \ + { \ + for( int j = 0; j < M; ++j ) \ + { \ + PORTABLE_EXPECT_EQ( dstMatrix[ i ][ j ], srcMatrix[ j ][ i ] ); \ + } \ + } + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + 
_TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+private:
+  std::ptrdiff_t const m_matrixASeed = 0;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixA_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixA_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixA_KJI { 1, N, M };
+  T m_matrixA_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixBSeed = m_matrixASeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixB_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixB_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixB_KJI { 1, N, M };
+  T m_matrixB_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixNNSeed = m_matrixBSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixNN_IJK { 1, N, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixNN_IKJ { 1, N, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixNN_KJI { 1, N, N };
+  T m_matrixNN_local[ N ][ N ];
+
+  std::ptrdiff_t const m_matrixMNSeed = m_matrixNNSeed + N * N;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixMN_IJK { 1, M, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixMN_IKJ { 1, M, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixMN_KJI { 1, M, N };
+  T m_matrixMN_local[ M ][ N ];
+
+  std::ptrdiff_t const m_vectorNSeed = m_matrixMNSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorN_IJ { 1, N };
+  ArrayT< T, RAJA::PERM_JI > m_vectorN_JI { 1, N };
+  T m_vectorN_local[ N ];
+
+  std::ptrdiff_t const m_vectorMSeed = m_vectorNSeed + N;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorM_IJ { 1, M };
+  ArrayT< T, RAJA::PERM_JI > m_vectorM_JI { 1, M };
+  T m_vectorM_local[ M ];
+};
+
+
+using TwoSizesTestTypes = ::testing::Types<
+  std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, serialPolicy >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
+
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
+  , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+#endif
+  >;
+
+TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, );
+
+
 TYPED_TEST( TwoSizesTest, scale )
 {
   this->testScale();
diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp
index 5bef9a4c..45ad8380 100644
--- a/unitTests/testTypeManipulation.cpp
+++ b/unitTests/testTypeManipulation.cpp
@@ -78,6 +78,23 @@ CUDA_TEST( typeManipulation, forEachArg )
   }, intReducer, floatReducer, doubleReducer );
   } );
 
+  EXPECT_EQ( intReducer.get(), 2 );
+  EXPECT_EQ( floatReducer.get(), 4 );
+  EXPECT_EQ( doubleReducer.get(), 7 );
+#elif defined(LVARRAY_USE_HIP)
+  // Test on device.
+ RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 ); + RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 ); + RAJA::ReduceSum< RAJA::hip_reduce, double > doubleReducer( 6 ); + forall< parallelDevicePolicy< 32 > >( 1, [intReducer, floatReducer, doubleReducer] LVARRAY_DEVICE ( int ) + { + // This has to be a host-device lambda to avoid errors. + typeManipulation::forEachArg( [] LVARRAY_HOST_DEVICE ( auto & reducer ) + { + reducer += 1; + }, intReducer, floatReducer, doubleReducer ); + } ); + EXPECT_EQ( intReducer.get(), 2 ); EXPECT_EQ( floatReducer.get(), 4 ); EXPECT_EQ( doubleReducer.get(), 7 ); diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 161c8453..639b20ad 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -73,6 +73,19 @@ struct RAJAHelper< RAJA::cuda_exec< N > > static constexpr MemorySpace space = MemorySpace::cuda; }; +#elif defined(LVARRAY_USE_HIP) + +template< unsigned long THREADS_PER_BLOCK > +using parallelDevicePolicy = RAJA::hip_exec< THREADS_PER_BLOCK >; + +template< unsigned long N > +struct RAJAHelper< RAJA::hip_exec< N > > +{ + using ReducePolicy = RAJA::hip_reduce; + using AtomicPolicy = RAJA::hip_atomic; + static constexpr MemorySpace space = MemorySpace::hip; +}; + #endif template< typename POLICY, typename INDEX_TYPE, typename LAMBDA > @@ -104,14 +117,14 @@ LAYOUT const & getRAJAViewLayout( RAJA::View< T, LAYOUT > const & view ) } -#ifndef __CUDA_ARCH__ -#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R ) -#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \ - STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ); -#else +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define PORTABLE_EXPECT_EQ( L, R ) LVARRAY_ERROR_IF_NE( L, R ) #define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) LVARRAY_ERROR_IF_GE_MSG( math::abs( ( L ) -( R ) ), EPSILON, \ STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ) ); +#else +#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R ) +#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \ + STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ); #endif // Comparator that compares a std::pair by it's first object. 
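The reordered PORTABLE_EXPECT_EQ / PORTABLE_EXPECT_NEAR block above keys the device branch off __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__, so a host-device lambda asserts through LVARRAY_ERROR_IF_NE when compiled for either GPU back end and falls back to the ordinary gtest macros on the host. A minimal sketch of the intended usage, using a hypothetical checkIota helper built only on the forall helper and policy aliases defined in this header (illustrative, not part of the patch):

template< typename POLICY >
void checkIota()
{
  forall< POLICY >( 16, [] LVARRAY_HOST_DEVICE ( int const i )
  {
    // Expands to EXPECT_EQ on the host; in device code it expands to
    // LVARRAY_ERROR_IF_NE, which aborts the current kernel on failure.
    PORTABLE_EXPECT_EQ( ( i + 1 ) - 1, i );
  } );
}

checkIota< serialPolicy >();               // reports through gtest
checkIota< parallelDevicePolicy< 32 > >(); // RAJA::hip_exec< 32 > under LVARRAY_USE_HIP

Because RAJAHelper is now specialized for RAJA::hip_exec, the same test bodies run unchanged on serial, CUDA, and HIP builds.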
From 0e2996f424a4c8ad06807f13dc569b2d9a1d48c9 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 25 Mar 2022 14:43:35 -0400 Subject: [PATCH 07/34] hip changes, crusher tpl installs --- CMakeLists.txt | 2 +- host-configs/ORNL/crusher-cce@13.0.1.cmake | 29 ++++++++++++++-------- src/ChaiBuffer.hpp | 11 ++++++-- src/Macros.hpp | 26 ++++++++++++++++--- src/system.cpp | 7 +++++- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e53d193d..f682d16b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) -blt_list_append( TO lvarray_dependencies ELEMENTS hip hip_runtime IF ENABLE_HIP ) +blt_list_append( TO lvarray_dependencies ELEMENTS blt::hip IF ENABLE_HIP ) blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 65830097..39684b93 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -4,14 +4,18 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") -set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) -set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +#set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) +#set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +set(CAMP_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) -set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-xpf6nnrxjhhggterbeto5ugxdgftpmon" CACHE PATH "" ) + set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) +set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-develop-6yofhoaebc3bnz5wbzqnweeknbpomgrt" CACHE PATH "" ) + +#set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-develop-6rh55pqg6dxvconxa52itkvdnptm3mfl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -28,19 +32,24 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) # HIP Options set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) -#set( CMAKE_CXX_FLAGS 
"--offload-arch=gfx90a -x hip -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) +# set( CMAKE_CXX_FLAGS "-D__HIP_PLATFORM_AMD__ -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1 --rocm-path=${HIP_ROOT} -x hip" CACHE STRING "" FORCE ) + +# set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) +# set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) -set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -set( HIP_HIPCC_FLAGS "-std=c++14 --amdgpu-target=gfx90a" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) +# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +# set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) -set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link -set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) +set(CMAKE_CXX_FLAGS "-D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE) +set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index 6b0d45ec..e82bd499 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -78,14 +78,20 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::NONE ) return MemorySpace::undefined; if( space == chai::CPU ) + { + std::cout << "toHost" << std::endl; return MemorySpace::host; + } #if defined(LVARRAY_USE_CUDA) if( space == chai::GPU ) return MemorySpace::cuda; #endif #if defined(LVARRAY_USE_HIP) if( space == chai::GPU ) + { + std::cout << "toHIPGPU" << std::endl; return MemorySpace::hip; + } #endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -149,6 +155,7 @@ class ChaiBuffer for( int space = chai::CPU; space < chai::NUM_EXECUTION_SPACES; ++space ) { + // std::cout << space << std::endl m_pointerRecord->m_allocators[ space ] = internal::getArrayManager().getAllocatorId( chai::ExecutionSpace( space ) ); } } @@ -193,7 +200,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -211,7 +218,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); diff --git a/src/Macros.hpp b/src/Macros.hpp index a2060c1a..e5549bb6 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,7 +22,20 @@ #include #include -#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) +#if defined(LVARRAY_USE_CUDA) + 
#define LVARRAY_GPU_LANG CUDA +#elif defined(LVARRAY_USE_HIP) + #define LVARRAY_GPU_LANG HIP +#endif + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + #define LVARRAY_ON_DEVICE 1 +#else + #define LVARRAY_ON_DEVICE 0 +#endif + + +#if defined(LVARRAY_GPU_LANG) #include #endif @@ -91,7 +104,7 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#if defined(__CUDA_ARCH__) #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -118,6 +131,7 @@ } \ } while( false ) #endif +//#elif defined(__HIP_DEVICE_COMPILE__) #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -535,7 +549,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIPCC__) ) +#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIP_DEVICE_COMPILE__) ) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ @@ -549,7 +563,11 @@ * call host only code. This is safe as long as the host only instantiations are only called on * the host. To use place directly above a the template. */ -#define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#if defined(LVARRAY_USE_CUDA) + #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#else + #define DISABLE_HD_WARNING +#endif #else /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE diff --git a/src/system.cpp b/src/system.cpp index 25a2ec13..a6532ac5 100644 --- a/src/system.cpp +++ b/src/system.cpp @@ -417,11 +417,16 @@ std::string calculateSize( size_t const bytes ) suffix = "MB"; shift = 20; } - else + else if( bytes >> 10 != 0 ) { suffix = "KB"; shift = 10; } + else + { + suffix = "B"; + shift = 0; + } double const units = double( bytes ) / ( 1 << shift ); From ada2118dce68bc47eb3916663478f39b7863b4ea Mon Sep 17 00:00:00 2001 From: William Tobin Date: Mon, 4 Apr 2022 13:59:22 -0400 Subject: [PATCH 08/34] ongoing hip work and debugging --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 21 +++--- src/Array.hpp | 10 +-- src/ArrayOfArraysView.hpp | 4 ++ src/ArraySlice.hpp | 4 +- src/ArrayView.hpp | 3 + src/CRSMatrix.hpp | 2 +- src/CRSMatrixView.hpp | 3 +- src/ChaiBuffer.hpp | 11 +-- src/Macros.hpp | 64 +++++++++-------- src/arrayManipulation.hpp | 3 +- src/bufferManipulation.hpp | 2 +- src/math.hpp | 82 +++++++++++++--------- src/sortedArrayManipulation.hpp | 2 +- unitTests/testTensorOpsFixedSize.cpp | 1 - 14 files changed, 114 insertions(+), 98 deletions(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 39684b93..e12e2ec6 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,21 +1,19 @@ + set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") -#set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) -#set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) set(CAMP_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE 
PATH "" ) +set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) + set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-xpf6nnrxjhhggterbeto5ugxdgftpmon" CACHE PATH "" ) +set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-develop-6yofhoaebc3bnz5wbzqnweeknbpomgrt" CACHE PATH "" ) - -#set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) -set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-develop-6rh55pqg6dxvconxa52itkvdnptm3mfl" CACHE PATH "" ) +set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -45,10 +43,13 @@ set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) # set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) # set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) -# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link # set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) -set(CMAKE_CXX_FLAGS "-D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE) +# BLT WTF +#set(CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +#set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) + set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options @@ -56,7 +57,7 @@ set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") set(gtest_disable_pthreads ON CACHE BOOL "") set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -#set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) diff --git a/src/Array.hpp b/src/Array.hpp index d05769cd..28ef6f95 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,10 +91,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -121,10 +121,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if 
defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -588,7 +588,7 @@ class Array : public ArrayView< T, void setName( std::string const & name ) { this->m_dataBuffer.template setName< decltype(*this) >( name ); } -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 52c8df15..e3aeb72a 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -202,12 +202,14 @@ class ArrayOfArraysView * @brief A constructor to create an uninitialized ArrayOfArraysView. * @note An uninitialized ArrayOfArraysView should not be used until it is assigned to. */ + LVARRAY_HOST_DEVICE ArrayOfArraysView() = default; /** * @brief Default copy constructor. * @note The copy constructor will trigger the copy constructor for @tparam BUFFER_TYPE */ + LVARRAY_HOST_DEVICE ArrayOfArraysView( ArrayOfArraysView const & ) = default; /** @@ -244,6 +246,7 @@ class ArrayOfArraysView * @brief Default copy assignment operator. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView const & ) = default; @@ -252,6 +255,7 @@ class ArrayOfArraysView * @param src the SparsityPatternView to be moved from. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView && src ) { diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp index b4e22345..374979e3 100644 --- a/src/ArraySlice.hpp +++ b/src/ArraySlice.hpp @@ -126,7 +126,7 @@ class ArraySlice m_dims( inputDimensions ), m_strides( inputStrides ) { -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK) ArraySlice::TV_ttf_display_type( nullptr ); #endif } @@ -341,7 +341,7 @@ class ArraySlice ///@} -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index aabd48bf..799ac461 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -118,6 +118,7 @@ class ArrayView * @brief A constructor to create an uninitialized ArrayView. * @note An uninitialized ArrayView should not be used until it is assigned to. */ + LVARRAY_HOST_DEVICE ArrayView() = default; /** @@ -185,6 +186,7 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ + //LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -206,6 +208,7 @@ class ArrayView {} /// The default destructor. 
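+ // Decorated like the defaulted constructors above so the defaulted
+ // destructor is usable in device code; hip/CCE is presumably stricter
+ // than nvcc about implicitly host-only special members in kernels.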
+ LVARRAY_HOST_DEVICE ~ArrayView() = default; /** diff --git a/src/CRSMatrix.hpp b/src/CRSMatrix.hpp index ddd786c5..daffdd9e 100644 --- a/src/CRSMatrix.hpp +++ b/src/CRSMatrix.hpp @@ -139,7 +139,7 @@ class CRSMatrix : protected CRSMatrixView< T, COL_TYPE, INDEX_TYPE, BUFFER_TYPE RAJA::forall< POLICY >( RAJA::TypedRangeSegment< INDEX_TYPE >( 0, numRows() ), [view] LVARRAY_HOST_DEVICE ( INDEX_TYPE const row ) { - INDEX_TYPE const nnz = view.numNonZeros( row ); + INDEX_TYPE const nnz = view.numNonZeros( row ); T * const entries = view.getEntries( row ); arrayManipulation::destroy( entries, nnz ); } ); diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp index bc954672..fe3c7c99 100644 --- a/src/CRSMatrixView.hpp +++ b/src/CRSMatrixView.hpp @@ -106,12 +106,13 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE /** * @brief Default copy constructor. */ + LVARRAY_HOST_DEVICE CRSMatrixView( CRSMatrixView const & ) = default; /** * @brief Default move constructor. */ - inline + LVARRAY_HOST_DEVICE inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index e82bd499..f78998a8 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -78,20 +78,14 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::NONE ) return MemorySpace::undefined; if( space == chai::CPU ) - { - std::cout << "toHost" << std::endl; return MemorySpace::host; - } #if defined(LVARRAY_USE_CUDA) if( space == chai::GPU ) return MemorySpace::cuda; #endif #if defined(LVARRAY_USE_HIP) if( space == chai::GPU ) - { - std::cout << "toHIPGPU" << std::endl; return MemorySpace::hip; - } #endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -155,7 +149,6 @@ class ChaiBuffer for( int space = chai::CPU; space < chai::NUM_EXECUTION_SPACES; ++space ) { - // std::cout << space << std::endl m_pointerRecord->m_allocators[ space ] = internal::getArrayManager().getAllocatorId( chai::ExecutionSpace( space ) ); } } @@ -200,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -218,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); diff --git a/src/Macros.hpp b/src/Macros.hpp index e5549bb6..d4ff7562 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -12,6 +12,8 @@ #pragma once +#pragma clang diagnostic ignored "-Wfloat-equal" + // Source includes #include "LvArrayConfig.hpp" #include "system.hpp" @@ -22,22 +24,23 @@ #include #include -#if defined(LVARRAY_USE_CUDA) - #define LVARRAY_GPU_LANG CUDA -#elif defined(LVARRAY_USE_HIP) - #define LVARRAY_GPU_LANG HIP + +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) + #define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - #define LVARRAY_ON_DEVICE 1 -#else - #define 
LVARRAY_ON_DEVICE 0 + #define LVARRAY_DEVICE_COMPILE #endif +#if defined(__CUDACC__) || defined(__HIPCC__) + #define LVARRAY_DECORATE +#endif -#if defined(LVARRAY_GPU_LANG) + +//#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) #include -#endif +//#endif /** * @brief Convert @p A into a string. @@ -51,6 +54,8 @@ */ #define STRINGIZE( A ) STRINGIZE_NX( A ) +//#pragma message "LVARRAY_DEVICE_COMPILE: " STRINGIZE(LVARRAY_DEVICE_COMPILE) + /** * @brief Mark @p X as an unused argument, used to silence compiler warnings. * @param X the unused argument. @@ -104,8 +109,8 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) - #if !defined(NDEBUG) +#if defined(LVARRAY_DEVICE_COMPILE) +// #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -114,24 +119,23 @@ assert( false && "EXP = " STRINGIZE( EXP ) "MSG = " STRINGIZE( MSG ) ); \ } \ } while( false ) - #else -#define LVARRAY_ERROR_IF( EXP, MSG ) \ - do \ - { \ - if( EXP ) \ - { \ - constexpr char const * formatString = "***** ERROR\n" \ - "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ - printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ - asm ( "trap;" ); \ - } \ - } while( false ) - #endif -//#elif defined(__HIP_DEVICE_COMPILE__) +// #else +// #define LVARRAY_ERROR_IF( EXP, MSG ) \ +// do \ +// { \ +// if( EXP ) \ +// { \ +// constexpr char const * formatString = "***** ERROR\n" \ +// "***** LOCATION: " LOCATION "\n" \ +// "***** Block: [%u, %u, %u]\n" \ +// "***** Thread: [%u, %u, %u]\n" \ +// "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ +// "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ +// printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ +// asm ( "trap;" ); \ +// } \ +// } while( false ) +// #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -549,7 +553,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIP_DEVICE_COMPILE__) ) +#if defined(LVARRAY_DECORATE) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 5409e60f..4b5c2d55 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -297,8 +297,7 @@ void resize( T * const LVARRAY_RESTRICT ptr, { if( newSize - size > 0 ) { - std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); - std::memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( newSize - size ) * sizeof( T ) ); } } else diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index b06a4e4c..548cfe2b 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -292,7 +292,7 @@ void resize( BUFFER & buf, std::ptrdiff_t const size, std::ptrdiff_t const newSi arrayManipulation::resize( buf.data(), size, newSize, std::forward< ARGS >( args )... 
); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) if( newSize > 0 ) { buf.registerTouch( MemorySpace::host ); diff --git a/src/math.hpp b/src/math.hpp index f832e0fa..3bf2d9fd 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -134,7 +134,7 @@ __half2 convert( __half2 const, U const u ) LVARRAY_HOST_DEVICE inline __half2 convert( __half2 const, __half const u ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __half2half2( u ); #else return __float2half2_rn( u ); @@ -164,7 +164,7 @@ __half2 convert( __half2 const, U const u, V const v ) LVARRAY_HOST_DEVICE inline __half2 convert( __half2 const, __half const u, __half const v ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __halves2half2( u, v ); #else return __floats2half2_rn( u, v ); @@ -310,7 +310,7 @@ LVARRAY_HOST_DEVICE inline constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::max( a, b ); #else return std::max( a, b ); @@ -323,8 +323,10 @@ max( T const a, T const b ) LVARRAY_DEVICE inline __half max( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt( a, b ) ? a : b; #else return a > b ? a : b; #endif @@ -334,8 +336,10 @@ __half max( __half const a, __half const b ) LVARRAY_DEVICE inline __half2 max( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - return __hmax2( a, b ); +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + return __hmax2( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt2( a, b ) ? a : b; #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -357,7 +361,7 @@ LVARRAY_HOST_DEVICE inline constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::min( a, b ); #else return std::min( a, b ); @@ -370,8 +374,10 @@ min( T const a, T const b ) LVARRAY_DEVICE inline __half min( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hlt( a, b ) ? a : b; #else return a < b ? a : b; #endif @@ -381,8 +387,10 @@ __half min( __half const a, __half const b ) LVARRAY_DEVICE inline __half2 min( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin2( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hlt2( a, b ) ? 
a : b; #else __half2 const aFactor = __hle2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -401,7 +409,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline constexpr T abs( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::abs( x ); #else return std::abs( x ); @@ -460,7 +468,7 @@ T square( T const x ) LVARRAY_HOST_DEVICE inline float sqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrtf( x ); #else return std::sqrt( x ); @@ -472,7 +480,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double sqrt( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrt( double( x ) ); #else return std::sqrt( x ); @@ -502,7 +510,7 @@ __half2 sqrt( __half2 const x ) LVARRAY_HOST_DEVICE inline float invSqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::rsqrtf( x ); #else return 1 / std::sqrt( x ); @@ -514,7 +522,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double invSqrt( T const x ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return ::rsqrt( double( x ) ); #else return 1 / std::sqrt( x ); @@ -551,7 +559,7 @@ __half2 invSqrt( __half2 const x ) LVARRAY_HOST_DEVICE inline float sin( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sinf( theta ); #else return std::sin( theta ); @@ -563,7 +571,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double sin( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sin( double( theta ) ); #else return std::sin( theta ); @@ -593,7 +601,7 @@ __half2 sin( __half2 const theta ) LVARRAY_HOST_DEVICE inline float cos( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cosf( theta ); #else return std::cos( theta ); @@ -605,7 +613,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double cos( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cos( double( theta ) ); #else return std::cos( theta ); @@ -635,8 +643,12 @@ __half2 cos( __half2 const theta ) LVARRAY_HOST_DEVICE inline void sincos( float const theta, float & sinTheta, float & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + #if defined(LVARRAY_USE_CUDA) + ::sincos( theta, &sinTheta, &cosTheta ); + #elif defined(LVARRAY_USE_HIP) + ::sincosf( theta, &sinTheta, &cosTheta ); + #endif #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -648,8 +660,8 @@ template< typename T > LVARRAY_HOST_DEVICE inline void sincos( double const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + ::sincos( theta, &sinTheta, &cosTheta ); // hip and cuda versions both use double #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -661,7 +673,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline void sincos( T const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) double s, c; ::sincos( theta, &s, &c ); sinTheta = s; @@ -701,7 +713,7 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) LVARRAY_HOST_DEVICE inline float tan( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tanf( theta ); 
#else return std::tan( theta ); @@ -713,7 +725,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double tan( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tan( double( theta ) ); #else return std::tan( theta ); @@ -845,7 +857,7 @@ T atan2Impl( T const y, T const x ) LVARRAY_HOST_DEVICE inline float asin( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asinf( x ); #else return std::asin( x ); @@ -857,7 +869,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double asin( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asin( double( x ) ); #else return std::asin( x ); @@ -887,7 +899,7 @@ __half2 asin( __half2 const x ) LVARRAY_HOST_DEVICE inline float acos( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acosf( x ); #else return std::acos( x ); @@ -899,7 +911,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double acos( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acos( double( x ) ); #else return std::acos( x ); @@ -930,7 +942,7 @@ __half2 acos( __half2 const x ) LVARRAY_HOST_DEVICE inline float atan2( float const y, float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2f( y, x ); #else return std::atan2( y, x ); @@ -942,7 +954,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double atan2( T const y, T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2( double( y ), double( x ) ); #else return std::atan2( y, x ); @@ -979,7 +991,7 @@ __half2 atan2( __half2 const y, __half2 const x ) LVARRAY_HOST_DEVICE inline float exp( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::expf( x ); #else return std::exp( x ); @@ -991,7 +1003,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double exp( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::exp( double( x ) ); #else return std::exp( x ); @@ -1021,7 +1033,7 @@ __half2 exp( __half2 const x ) LVARRAY_HOST_DEVICE inline float log( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::logf( x ); #else return std::log( x ); @@ -1033,7 +1045,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double log( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::log( double( x ) ); #else return std::log( x ); diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 7e9cae5d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -211,7 +211,7 @@ LVARRAY_HOST_DEVICE inline void makeSorted( RandomAccessIterator const first, RandomAccessIterator const last, Compare && comp=Compare() ) { -#ifdef __CUDA_ARCH__ +#if defined(LVARRAY_DEVICE_COMPILE) if( last - first > internal::INTROSORT_THRESHOLD ) { internal::introsortLoop( first, last, comp ); diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index c4ba14cb..21392a17 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -616,6 +616,5 @@ TYPED_TEST( FixedSizeSquareMatrixTest, denseToSymmetric ) { this->denseToSymmetric(); } - } // namespace testing } // namespace LvArray From dc6df9d0623ecba7d0686c5050f140370dd13944 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 12 May 2022 10:10:55 -0700 
Subject: [PATCH 09/34] reactivate device error macro --- src/Macros.hpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index d4ff7562..b38f757b 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -110,7 +110,7 @@ * guaranteed. In fact it is only guaranteed to abort the current kernel. */ #if defined(LVARRAY_DEVICE_COMPILE) -// #if !defined(NDEBUG) + #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -119,23 +119,24 @@ assert( false && "EXP = " STRINGIZE( EXP ) "MSG = " STRINGIZE( MSG ) ); \ } \ } while( false ) -// #else -// #define LVARRAY_ERROR_IF( EXP, MSG ) \ -// do \ -// { \ -// if( EXP ) \ -// { \ -// constexpr char const * formatString = "***** ERROR\n" \ -// "***** LOCATION: " LOCATION "\n" \ -// "***** Block: [%u, %u, %u]\n" \ -// "***** Thread: [%u, %u, %u]\n" \ -// "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ -// "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ -// printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ -// asm ( "trap;" ); \ -// } \ -// } while( false ) -// #endif + #else +#define LVARRAY_ERROR_IF( EXP, MSG ) \ + do \ + { \ + if( EXP ) \ + { \ + constexpr char const * formatString = "***** ERROR\n" \ + "***** LOCATION: " LOCATION "\n" \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ + asm ( "trap;" ); \ + } \ + } while( false ) + #endif + #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ From 08ef87505014303e0c7c95d7802aa03022d2c09f Mon Sep 17 00:00:00 2001 From: wrtobin Date: Fri, 13 May 2022 13:20:59 -0700 Subject: [PATCH 10/34] hip device namespace issue --- src/arrayManipulation.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 4b5c2d55..21f708e1 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -297,7 +297,8 @@ void resize( T * const LVARRAY_RESTRICT ptr, { if( newSize - size > 0 ) { - memset( reinterpret_cast< void * >( ptr + size ), 0, ( newSize - size ) * sizeof( T ) ); + std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); } } else From cee3b1cb3f6468f943598a3c929c57b345184b25 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Thu, 12 May 2022 13:16:34 -0400 Subject: [PATCH 11/34] more crusher changes --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 16 ++++++++++------ src/ArrayView.hpp | 10 +++++----- src/indexing.hpp | 2 +- src/sortedArrayManipulation.hpp | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index e12e2ec6..d76cde9b 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -4,16 +4,17 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") -set(CAMP_DIR 
"/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) -set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -46,9 +47,12 @@ set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) # set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link # set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) +set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) +set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) + # BLT WTF -#set(CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) -#set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +# set(CMAKE_HIP_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +# set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc " CACHE STRING "" FORCE) set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 799ac461..7f9f8df3 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -186,7 +186,7 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ - //LVARRAY_HOST_DEVICE + LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -515,7 +515,7 @@ class ArrayView * @note This method is only active when NDIM > 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -534,7 +534,7 @@ class ArrayView * prevents that from happening. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const && noexcept = delete; @@ -544,7 +544,7 @@ class ArrayView * @note This method is only active when NDIM == 1. 
*/ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< _NDIM == 1, T & > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -558,7 +558,7 @@ class ArrayView * @param indices The indices of the value to access. */ template< typename ... INDICES > - LVARRAY_HOST_DEVICE inline constexpr + LVARRAY_HOST_DEVICE __forceinline__ constexpr T & operator()( INDICES... indices ) const { static_assert( sizeof ... (INDICES) == NDIM, "number of indices does not match NDIM" ); diff --git a/src/indexing.hpp b/src/indexing.hpp index 2dca4597..dbb6219f 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -44,7 +44,7 @@ struct ConditionalMultiply * @param b The right multiplication operand. */ template< typename A, typename B > - static inline LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) + static __forceinline__ LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) { return a * b; } }; diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index d4bdbeed..1291929d 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -407,7 +407,7 @@ bool isSortedUnique( ITER first, ITER const last, Compare && comp=Compare() ) */ DISABLE_HD_WARNING template< typename T, typename Compare=less< T > > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE __forceinline__ std::ptrdiff_t find( T const * const LVARRAY_RESTRICT ptr, std::ptrdiff_t const size, T const & value, From 9926a7013b9f8290c1d39ab78bf083a07573d803 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 13 May 2022 17:03:19 -0400 Subject: [PATCH 12/34] raja api change --- src/ArrayOfArraysView.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index e3aeb72a..d042d091 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,12 +732,7 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -// RAJA::inclusive_scan< POLICY >( capacities, -// capacities + numSubArrays, -// m_offsets.data() + 1 ); - - RAJA::inclusive_scan< POLICY >( RAJA::make_span< INDEX_TYPE const * >( capacities, numSubArrays ), - RAJA::make_span< INDEX_TYPE * >( m_offsets.data()+1, numSubArrays ) ); + RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... 
); } From 50cb3431390413b253c6cfaef508da01b66a3740 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 20 May 2022 14:00:33 -0400 Subject: [PATCH 13/34] cleanup and changes related to cpu-only build post 05.17.22 crusher downtime --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 30 ++++------ .../ORNL/crusher-cpu-cce@13.0.1.cmake | 60 +++++++++++++++++++ src/ArrayView.hpp | 8 +-- src/indexing.hpp | 2 +- src/sortedArrayManipulation.hpp | 2 +- 5 files changed, 76 insertions(+), 26 deletions(-) create mode 100644 host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index d76cde9b..157fc0ab 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -31,30 +31,20 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) # HIP Options set( ENABLE_HIP ON CACHE BOOL "" FORCE ) -set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation -set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) -set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation -set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) -set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) + set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) -# set( CMAKE_CXX_FLAGS "-D__HIP_PLATFORM_AMD__ -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1 --rocm-path=${HIP_ROOT} -x hip" CACHE STRING "" FORCE ) + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() -# set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -# set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) - -# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link -# set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) - -set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) -set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) - -# BLT WTF -# set(CMAKE_HIP_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) -# set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc " CACHE STRING "" FORCE) - -set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake new file mode 100644 index 00000000..1a12fc7d --- /dev/null +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -0,0 +1,60 @@ + +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-mej6trivmy7o5vlr6a52cml6tzxb5fvk" CACHE PATH "" ) + 
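+# NOTE: these TPL hashes differ from the HIP-enabled config above; the
+# cpu-only stack was presumably rebuilt without device support.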
+set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-tmukf35ms7f2pkfswpejbnt3jtnpkakc" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-2022.03.0-unirfq5er4vtyr2koymgi3xxq6h2f5l5" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-aggyh463v2rz6s44laqshylc4xeeg4h7" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) + +# C++ options +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) + set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() + +set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +#BLT +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 7f9f8df3..1a2a3d03 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -515,7 +515,7 @@ class ArrayView * @note This method is only active when NDIM > 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -534,7 +534,7 @@ class ArrayView * prevents that from happening. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const && noexcept = delete; @@ -544,7 +544,7 @@ class ArrayView * @note This method is only active when NDIM == 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< _NDIM == 1, T & > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -558,7 +558,7 @@ class ArrayView * @param indices The indices of the value to access. */ template< typename ... INDICES > - LVARRAY_HOST_DEVICE __forceinline__ constexpr + LVARRAY_HOST_DEVICE inline constexpr T & operator()( INDICES... indices ) const { static_assert( sizeof ... 
(INDICES) == NDIM, "number of indices does not match NDIM" ); diff --git a/src/indexing.hpp b/src/indexing.hpp index dbb6219f..2dca4597 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -44,7 +44,7 @@ struct ConditionalMultiply * @param b The right multiplication operand. */ template< typename A, typename B > - static __forceinline__ LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) + static inline LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) { return a * b; } }; diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 1291929d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -407,7 +407,7 @@ bool isSortedUnique( ITER first, ITER const last, Compare && comp=Compare() ) */ DISABLE_HD_WARNING template< typename T, typename Compare=less< T > > -LVARRAY_HOST_DEVICE __forceinline__ +LVARRAY_HOST_DEVICE inline std::ptrdiff_t find( T const * const LVARRAY_RESTRICT ptr, std::ptrdiff_t const size, T const & value, From db28fd4513a27dfcbacd17f447e3a82a5f5c6216 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 20 May 2022 15:35:08 -0400 Subject: [PATCH 14/34] formatting --- src/Macros.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index b38f757b..cffdaed6 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -109,8 +109,11 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ +// cce processes __host__ functions with __hip_device_compile__=1 when -x hip? +// the entire compilation unit has __hip_device_compile__=1, whereas __cuda_arch__ +// seems to be scope-defined as it isn't defined in __host__ functions #if defined(LVARRAY_DEVICE_COMPILE) - #if !defined(NDEBUG) + #if !defined(NDEBUG) || __HIP_DEVICE_COMPILE__ #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -127,16 +130,15 @@ { \ constexpr char const * formatString = "***** ERROR\n" \ "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ asm ( "trap;" ); \ } \ } while( false ) #endif - #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ From eee17ae2bf135ee6c9760b924af4e523652b22eb Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 3 Jun 2022 15:05:01 -0400 Subject: [PATCH 15/34] cce@14.0.0 and rocm@5.2.0 --- host-configs/ORNL/crusher-cce@14.0.0.cmake | 69 +++++++ src/Macros.hpp | 4 + src/math.hpp | 206 +++++++++++---------- 3 files changed, 178 insertions(+), 101 deletions(-) create mode 100644 host-configs/ORNL/crusher-cce@14.0.0.cmake diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake new file mode 100644 index 00000000..6a509960 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -0,0 +1,69 @@ + +set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR 
"/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") + + + + + + diff --git a/src/Macros.hpp b/src/Macros.hpp index cffdaed6..60a545a5 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -31,6 +31,9 @@ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define LVARRAY_DEVICE_COMPILE + #define LVARRAY_FORCE_INLINE __forceinline__ +#else + #define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) @@ -38,6 +41,7 @@ #endif + //#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) #include //#endif diff --git a/src/math.hpp b/src/math.hpp index 3bf2d9fd..d2b07191 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -45,7 +45,7 @@ namespace internal * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( T const, U const u ) { return u; } @@ -55,7 +55,7 @@ T convert( T const, U const u ) * @return The number of values stored in @tparam T, by default this is 1. 
*/ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( T const ) { return 1; } @@ -76,7 +76,7 @@ struct SingleType * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getFirst( T const x ) { return x; } @@ -86,7 +86,7 @@ SingleType< T > getFirst( T const x ) * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } @@ -96,10 +96,17 @@ SingleType< T > getSecond( T const x ) * @param x The first value. * @param y The second value. */ -template< typename T > -LVARRAY_HOST_DEVICE inline constexpr -T lessThan( T const x, T const y ) -{ return __hlt( x, y ); } + LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +__half lessThan( __half const x, __half const y ) +{ + return __hlt( x, y ); +} + +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +__half2 lessThan( __half2 const x, __half2 const y ) +{ + return __hlt2( x, y ); +} #if defined( LVARRAY_USE_CUDA ) /** @@ -110,7 +117,7 @@ T lessThan( T const x, T const y ) * @return @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half convert( __half const, U const u ) { return __float2half_rn( u ); } @@ -122,7 +129,7 @@ __half convert( __half const, U const u ) * @return A @c __half2 with both halves having value @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u ) { return __float2half2_rn( u ); } @@ -131,7 +138,7 @@ __half2 convert( __half2 const, U const u ) * @param u The value to convert. * @return A @c __half2 with both halves having value @p u. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -151,7 +158,7 @@ __half2 convert( __half2 const, __half const u ) * @return A @c __half2 containing @p u as the first value and @p v as the second. */ template< typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u, V const v ) { return __floats2half2_rn( u, v ); } @@ -161,7 +168,7 @@ __half2 convert( __half2 const, U const u, V const v ) * @param v The second value to convert. * @return A @c __half2 containing @p u as the first value and @p v as the second. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u, __half const v ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -175,7 +182,7 @@ __half2 convert( __half2 const, __half const u, __half const v ) * @brief Return the number of values stored in a @c __half2, which is 2. * @return The number of values stored in a @c __half2, which is 2. */ -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( __half2 const & ) { return 2; } @@ -193,7 +200,7 @@ struct SingleType< __half2 > * @return The fist @c __half in @p x. * @param x The value to query. 
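 * @note Illustrative device-side usage, assuming a CUDA translation unit:
 * @code
 * __half2 const pair = __floats2half2_rn( 1.0f, 2.0f );
 * __half const lo = getFirst( pair ); // the low half, here 1.0
 * @endcode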
*/ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getFirst( __half2 const x ) { return __low2half( x ); } @@ -201,7 +208,7 @@ __half getFirst( __half2 const x ) * @return The second @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } @@ -210,7 +217,7 @@ __half getSecond( __half2 const x ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half lessThan( __half const x, __half const y ) { return __hlt( x, y ); } @@ -219,7 +226,7 @@ __half lessThan( __half const x, __half const y ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } @@ -238,7 +245,7 @@ __half2 lessThan( __half2 const x, __half2 const y ) * @return The number of values stored in type @tparam T. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues() { return internal::numValues( T() ); } @@ -258,7 +265,7 @@ using SingleType = typename internal::SingleType< T >::type; * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u ) { return internal::convert( T(), u ); } @@ -273,7 +280,7 @@ T convert( U const u ) * @return @p u, @p v converted to @tparam T. */ template< typename T, typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u, V const v ) { return internal::convert( T(), u, v ); } @@ -284,7 +291,7 @@ T convert( U const u, V const v ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getFirst( T const x ) { return internal::getFirst( x ); } @@ -295,7 +302,7 @@ SingleType< T > getFirst( T const x ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getSecond( T const x ) { return internal::getSecond( x ); } @@ -306,7 +313,7 @@ SingleType< T > getSecond( T const x ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { @@ -317,10 +324,10 @@ max( T const a, T const b ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half max( __half const a, __half const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -333,13 +340,11 @@ __half max( __half const a, __half const b ) } /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax2( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hgt2( a, b ) ? 
a : b; #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -357,7 +362,7 @@ __half2 max( __half2 const a, __half2 const b ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { @@ -371,26 +376,24 @@ min( T const a, T const b ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half min( __half const a, __half const b ) { -#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hlt( a, b ) ? a : b; #else return a < b ? a : b; #endif } /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half2 min( __half2 const a, __half2 const b ) { -#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin2( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hlt2( a, b ) ? a : b; #else __half2 const aFactor = __hle2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -406,7 +409,7 @@ __half2 min( __half2 const a, __half2 const b ) * @note This set of overloads is valid for any numeric type. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T abs( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -416,10 +419,10 @@ T abs( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half abs( __half const x ) { #if CUDART_VERSION > 11000 @@ -430,7 +433,7 @@ __half abs( __half const x ) } /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 abs( __half2 const x ) { #if CUDART_VERSION > 11000 @@ -448,7 +451,7 @@ __half2 abs( __half2 const x ) * @param x The value to square. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T square( T const x ) { return x * x; } @@ -465,7 +468,7 @@ T square( T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is @c double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sqrt( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -477,7 +480,7 @@ float sqrt( float const x ) /// @copydoc sqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sqrt( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -487,15 +490,15 @@ double sqrt( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sqrt( __half const x ) { return ::hsqrt( x ); } /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sqrt( __half2 const x ) { return ::h2sqrt( x ); } @@ -507,7 +510,7 @@ __half2 sqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. 
If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float invSqrt( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -519,7 +522,7 @@ float invSqrt( float const x ) /// @copydoc invSqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double invSqrt( T const x ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -529,15 +532,15 @@ double invSqrt( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half invSqrt( __half const x ) { return ::hrsqrt( x ); } /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 invSqrt( __half2 const x ) { return ::h2rsqrt( x ); } @@ -556,7 +559,7 @@ __half2 invSqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sin( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -568,7 +571,7 @@ float sin( float const theta ) /// @copydoc sin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sin( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -578,15 +581,15 @@ double sin( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sin( __half const theta ) { return ::hsin( theta ); } /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sin( __half2 const theta ) { return ::h2sin( theta ); } @@ -598,7 +601,7 @@ __half2 sin( __half2 const theta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float cos( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -610,7 +613,7 @@ float cos( float const theta ) /// @copydoc cos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double cos( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -620,15 +623,15 @@ double cos( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half cos( __half const theta ) { return ::hcos( theta ); } /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 cos( __half2 const theta ) { return ::h2cos( theta ); } @@ -640,7 +643,7 @@ __half2 cos( __half2 const theta ) * @param sinTheta The sine of @p theta. * @param cosTheta The cosine of @p theta. 
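 * @note A minimal usage sketch:
 * @code
 * float s, c;
 * sincos( 0.5f, s, c ); // computes sin( 0.5f ) and cos( 0.5f ) in one call
 * @endcode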
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( float const theta, float & sinTheta, float & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -657,7 +660,7 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( double const theta, double & sinTheta, double & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -670,7 +673,7 @@ void sincos( double const theta, double & sinTheta, double & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( T const theta, double & sinTheta, double & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -684,10 +687,10 @@ void sincos( T const theta, double & sinTheta, double & cosTheta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) { sinTheta = ::hsin( theta ); @@ -695,7 +698,7 @@ void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) } /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) { sinTheta = ::h2sin( theta ); @@ -710,7 +713,7 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
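 * @note The @c __half overloads below have no dedicated tangent intrinsic and
 * are composed from @c sincos as @code tan( theta ) == sin( theta ) / cos( theta ) @endcode.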
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float tan( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -722,7 +725,7 @@ float tan( float const theta ) /// @copydoc tan( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double tan( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -732,10 +735,10 @@ double tan( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half tan( __half const theta ) { __half s, c; @@ -744,7 +747,7 @@ __half tan( __half const theta ) } /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 tan( __half2 const theta ) { __half2 s, c; @@ -776,7 +779,7 @@ namespace internal * @note Modified from https://developer.download.nvidia.com/cg/asin.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T asinImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -798,7 +801,7 @@ T asinImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/acos.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T acosImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -820,7 +823,8 @@ T acosImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/atan2.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE T atan2Impl( T const y, T const x ) { T const absX = abs( x ); @@ -854,7 +858,7 @@ T atan2Impl( T const y, T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float asin( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -866,7 +870,7 @@ float asin( float const x ) /// @copydoc asin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double asin( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -876,15 +880,15 @@ double asin( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half asin( __half const x ) { return internal::asinImpl( x ); } /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 asin( __half2 const x ) { return internal::asinImpl( x ); } @@ -896,7 +900,7 @@ __half2 asin( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
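 * @note A quick sanity check, purely illustrative:
 * @code
 * float const halfPi = asin( 1.0f ); // approximately 1.5708f
 * @endcode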
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float acos( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -908,7 +912,7 @@ float acos( float const x ) /// @copydoc acos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double acos( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -918,15 +922,15 @@ double acos( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half acos( __half const x ) { return internal::acosImpl( x ); } /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 acos( __half2 const x ) { return internal::acosImpl( x ); } @@ -939,7 +943,7 @@ __half2 acos( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float atan2( float const y, float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -951,7 +955,7 @@ float atan2( float const y, float const x ) /// @copydoc atan2( float, float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double atan2( T const y, T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -964,12 +968,12 @@ double atan2( T const y, T const x ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half atan2( __half const y, __half const x ) { return internal::atan2Impl( y, x ); } /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 atan2( __half2 const y, __half2 const x ) { return internal::atan2Impl( y, x ); } @@ -988,7 +992,7 @@ __half2 atan2( __half2 const y, __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float exp( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1000,7 +1004,7 @@ float exp( float const x ) /// @copydoc exp( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double exp( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1010,15 +1014,15 @@ double exp( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half exp( __half const x ) { return ::hexp( x ); } /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 exp( __half2 const x ) { return ::h2exp( x ); } @@ -1030,7 +1034,7 @@ __half2 exp( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
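 * @note Unlike a plain arc tangent, the quadrant is recovered from the signs of
 * both arguments: @code atan2( 1.0f, -1.0f ) @endcode is 3 * pi / 4, not -pi / 4.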
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float log( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1042,7 +1046,7 @@ float log( float const x ) /// @copydoc log( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double log( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1052,15 +1056,15 @@ double log( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half log( __half const x ) { return ::hlog( x ); } /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 log( __half2 const x ) { return ::h2log( x ); } From 130837bdc1b2b64dceb02a4cb110e28340744be8 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 7 Jun 2022 12:05:14 -0700 Subject: [PATCH 16/34] remove spock, add crusher-base --- host-configs/ORNL/crusher-base.cmake | 25 ++++++++++++ host-configs/ORNL/crusher-cce@13.0.1.cmake | 26 +------------ host-configs/ORNL/crusher-cce@14.0.0.cmake | 29 +------------- .../ORNL/crusher-cpu-cce@13.0.1.cmake | 35 +---------------- host-configs/ORNL/spock-cce@12.0.3.cmake | 39 ------------------- 5 files changed, 31 insertions(+), 123 deletions(-) create mode 100644 host-configs/ORNL/crusher-base.cmake delete mode 100644 host-configs/ORNL/spock-cce@12.0.3.cmake diff --git a/host-configs/ORNL/crusher-base.cmake b/host-configs/ORNL/crusher-base.cmake new file mode 100644 index 00000000..53f647fa --- /dev/null +++ b/host-configs/ORNL/crusher-base.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 157fc0ab..30cf0bc7 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,5 +1,5 @@ -set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") @@ -24,37 +24,15 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) if( ENABLE_HIP ) set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation - set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + set( HIP_ROOT 
"/opt/rocm-4.5.2" CACHE PATH "" ) set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) endif() - -set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -#BLT -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake index 6a509960..f3d051a2 100644 --- a/host-configs/ORNL/crusher-cce@14.0.0.cmake +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -1,5 +1,6 @@ set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") +include( crusher-base.cmake ) # Set up the tpls set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") @@ -25,44 +26,18 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH " set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) - if( ENABLE_HIP ) set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) - + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) endif() -# suppress -Werror for now -set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) - -# GTEST -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -# disable most binaries and doc generation -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -# BLT trying to find MPI fails on cray with cce -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") - - diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake index 1a12fc7d..a4c98307 100644 --- a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -1,5 +1,6 @@ -set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "") +include( crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") @@ -24,37 +25,5 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER 
"/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - # HIP Options set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) - -if( ENABLE_HIP ) - set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation - - set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) - set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) - - set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) - set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) - set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) - set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) -endif() - -set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -#BLT -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/host-configs/ORNL/spock-cce@12.0.3.cmake b/host-configs/ORNL/spock-cce@12.0.3.cmake deleted file mode 100644 index f0764c32..00000000 --- a/host-configs/ORNL/spock-cce@12.0.3.cmake +++ /dev/null @@ -1,39 +0,0 @@ -set(CONFIG_NAME "spock-cce@12.0.3" CACHE PATH "") - -# Set up the tpls -set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen2/cce-12.0.3" CACHE PATH "") -set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") - -set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-qpmhf6p7n5sarmks55hgjnzff3ncs7jd/" CACHE PATH "" ) -set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-frggdmwjevbxy4a6kw7ctgrhyv7erfhr/" CACHE PATH "" ) - -set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-nkdetdg5tjyzzf5yjzo32jxwkmwfjjqn/" CACHE PATH "" ) -set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-wun25mr5qf7vo6x2vblhzh2ivs7vr4g6/" CACHE PATH "" ) -set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-a5ponjo23u7smy7w4a4jj7im47shrsxk/" CACHE PATH "" ) - -set(METIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/metis-5.1.0-rbblqiymq6eoursordyaq2ghimzpd22v/" CACHE PATH "" ) -set(PARMETIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/parmetis-4.0.3-mliemgo6vxrahsz4f6u5agdqyfpk2yd2/" CACHE PATH "" ) - -# C++ options -#set(CMAKE_C_COMPILER "/opt/cray/pe/cce/12.0.3/bin/craycc" CACHE PATH "") -#set(CMAKE_CXX_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayCC" CACHE PATH "") -#set(CMAKE_Fortran_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayftn" CACHE PATH "") - -set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.11/bin/cc" CACHE PATH "") -set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.11/bin/CC" CACHE PATH "") -set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.11/bin/ftn" CACHE PATH "") - -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI OFF CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) -set( HIP_ROOT "/opt/rocm-4.2.0" CACHE PATH "" ) -set( HIP_VERSION_STRING "4.2.0" CACHE STRING "" ) -set( CMAKE_HIP_ARCHITECTURES "gfx908" CACHE STRING "" FORCE ) - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") From 6e3c775f3d6dd0b71240fa3d4cfe2502fe7c10f6 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 
28 Jun 2022 14:23:28 -0400 Subject: [PATCH 17/34] cmake path --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 1 + host-configs/ORNL/crusher-cce@14.0.0.cmake | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 30cf0bc7..a10fda43 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,5 +1,6 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake index f3d051a2..967be640 100644 --- a/host-configs/ORNL/crusher-cce@14.0.0.cmake +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -1,6 +1,6 @@ set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") -include( crusher-base.cmake ) +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") From 9e26620986f5a5406baa228476fd8d4031a77781 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 28 Jun 2022 13:55:39 -0700 Subject: [PATCH 18/34] lessthan --- src/math.hpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/math.hpp b/src/math.hpp index d2b07191..5e29dcaa 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -90,24 +90,6 @@ LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } -/** - * @return 1 if @p x is less than @p y, else 0. - * @tparam T The type of @p x and @p y. - * @param x The first value. - * @param y The second value. - */ - LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE -__half lessThan( __half const x, __half const y ) -{ - return __hlt( x, y ); -} - -LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE -__half2 lessThan( __half2 const x, __half2 const y ) -{ - return __hlt2( x, y ); -} - #if defined( LVARRAY_USE_CUDA ) /** * @brief Convert @p u to @c __half. @@ -212,6 +194,9 @@ LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } +#endif + +#if defined( LVARRAY_USE_DEVICE ) /** * @return 1 if @p x is less than @p y, else 0. * @param x The first value. @@ -229,7 +214,6 @@ __half lessThan( __half const x, __half const y ) LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } - #endif } // namespace internal From 9b7f84aa51c912e17397580416c385362fb47cb1 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:27:39 -0700 Subject: [PATCH 19/34] lt --- src/math.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/math.hpp b/src/math.hpp index 5e29dcaa..61d0237e 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -90,6 +90,17 @@ LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } +/** + * @return 1 if @p x is less than @p y, else 0. + * @tparam T The type of @p x and @p y. + * @param x The first value. + * @param y The second value. + */ +template< typename T > +LVARRAY_HOST_DEVICE inline constexpr +T lessThan( T const x, T const y ) +{ return __hlt( x, y ); } + #if defined( LVARRAY_USE_CUDA ) /** * @brief Convert @p u to @c __half. 
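
Note on the lessThan shuffling in the two patches above: its 0/1 result is what
keeps the half-precision approximations in math.hpp branch-free, since the mask
is multiplied into the polynomial rather than branched on. A minimal host-side
sketch of the idiom, where select is an illustrative helper (not part of the
library) and T is assumed to be a scalar whose comparison yields exactly 0 or 1:

    template< typename T >
    T select( T const mask, T const onTrue, T const onFalse )
    {
      // mask is assumed to be exactly 0 or 1, so no branch is required.
      return mask * onTrue + ( T( 1 ) - mask ) * onFalse;
    }

The __half2 fallbacks for max and min apply the same pattern with __hge2 and __hle2.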
From ad7c1d906d9e6f433de1ea923f525573ae398858 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:36:07 -0700 Subject: [PATCH 20/34] removing pragma no longer needed on crusher --- src/Macros.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index 60a545a5..6e3536c9 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -12,8 +12,6 @@ #pragma once -#pragma clang diagnostic ignored "-Wfloat-equal" - // Source includes #include "LvArrayConfig.hpp" #include "system.hpp" From e531497a149c3a49319ec6aac0e91dcd7601a876 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:48:53 -0700 Subject: [PATCH 21/34] remove shim type temporarily used for raja versioning differences --- src/bufferManipulation.hpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 548cfe2b..83e5a00e 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -69,21 +69,6 @@ namespace bufferManipulation */ HAS_MEMBER_FUNCTION_NO_RTYPE( move, MemorySpace::host, true ); - -template < typename T > -struct ContainerShim -{ - ContainerShim( T * begin, T * end ) - : m_begin( begin ) - , m_end( end ) - {} - T * begin() const { return m_begin; } - T * end() const { return m_end; } - T * m_begin; - T * m_end; -}; - - /** * @class VoidBuffer * @brief This class implements the default behavior for the Buffer methods related From a5949848876bda458980e29822fb31fc70e98d01 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:59:18 -0700 Subject: [PATCH 22/34] doc --- src/Macros.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Macros.hpp b/src/Macros.hpp index 6e3536c9..8717e397 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -24,17 +24,22 @@ #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) + /// Macro defined when using a device. #define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + /// Macro defined when currently compiling on device (only defined in the device context). #define LVARRAY_DEVICE_COMPILE + /// Marks a function/lambda for inlining #define LVARRAY_FORCE_INLINE __forceinline__ #else + /// Marks a function/lambda for inlining #define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) + // Denotes whether to define decorator macros later in this file. 
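+  // (__CUDACC__ and __HIPCC__ are defined by the CUDA and HIP compiler drivers
+  // on every pass over a device-capable translation unit, so the decorator
+  // macros guarded here are visible to host and device code alike.)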
#define LVARRAY_DECORATE #endif From 286929765be04d96af894127cc4049cff420d15e Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:13:42 -0700 Subject: [PATCH 23/34] versioning, merge from dev, bugs only showing up in tests which weren't compiled on crusher --- src/ArrayOfArraysView.hpp | 4 ++++ unitTests/testArray1DOfArray1DOfArray1D.cpp | 2 +- unitTests/testMath.cpp | 1 + unitTests/testSortedArray.cpp | 2 +- unitTests/testTensorOpsTwoSizes1.cpp | 4 ++-- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index d042d091..72695e3d 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,7 +732,11 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; +#if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); +#else + RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); +#endif }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... ); } diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp index cdd17fe2..5038d778 100644 --- a/unitTests/testArray1DOfArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp @@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp index d7c76b19..f6d193cb 100644 --- a/unitTests/testMath.cpp +++ b/unitTests/testMath.cpp @@ -154,6 +154,7 @@ using TestMathTypes = ::testing::Types< #endif #if defined( LVARRAY_USE_CUDA ) , std::pair< __half, parallelDevicePolicy< 32 > > +#endif >; TYPED_TEST_SUITE( TestMath, TestMathTypes, ); diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp index ae145fbd..fe52ddfc 100644 --- a/unitTests/testSortedArray.cpp +++ b/unitTests/testSortedArray.cpp @@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types< std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > -#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI) , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp index 101c4671..96ac793c 100644 --- a/unitTests/testTensorOpsTwoSizes1.cpp +++ b/unitTests/testTensorOpsTwoSizes1.cpp @@ -938,9 +938,9 @@ using TwoSizesTestTypes = ::testing::Types< #endif >; -TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes ); +TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, ); + - TYPED_TEST( 
TwoSizesTest, scale ) { this->testScale(); From 82b3d4bda74828e9072351d7552f7c35ebf09a69 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:20:18 -0700 Subject: [PATCH 24/34] crustify --- src/Array.hpp | 2 +- src/ArrayOfArraysView.hpp | 2 +- src/CRSMatrix.hpp | 2 +- src/Macros.hpp | 32 +++++++++++++++--------------- src/math.hpp | 6 +++--- unitTests/testArray_ChaiBuffer.cpp | 2 +- unitTests/testChaiBuffer.cpp | 2 +- unitTests/testUtils.hpp | 2 +- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/Array.hpp b/src/Array.hpp index 28ef6f95..503d4750 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,7 +91,7 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(LVARRAY_DEVICE_COMPILE) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif #if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 72695e3d..58aa5e96 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -733,7 +733,7 @@ class ArrayOfArraysView { m_offsets[ 0 ] = 0; #if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 - RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); + RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); #else RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); #endif diff --git a/src/CRSMatrix.hpp b/src/CRSMatrix.hpp index daffdd9e..ddd786c5 100644 --- a/src/CRSMatrix.hpp +++ b/src/CRSMatrix.hpp @@ -139,7 +139,7 @@ class CRSMatrix : protected CRSMatrixView< T, COL_TYPE, INDEX_TYPE, BUFFER_TYPE RAJA::forall< POLICY >( RAJA::TypedRangeSegment< INDEX_TYPE >( 0, numRows() ), [view] LVARRAY_HOST_DEVICE ( INDEX_TYPE const row ) { - INDEX_TYPE const nnz = view.numNonZeros( row ); + INDEX_TYPE const nnz = view.numNonZeros( row ); T * const entries = view.getEntries( row ); arrayManipulation::destroy( entries, nnz ); } ); diff --git a/src/Macros.hpp b/src/Macros.hpp index 8717e397..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -24,23 +24,23 @@ #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) - /// Macro defined when using a device. - #define LVARRAY_USE_DEVICE +/// Macro defined when using a device. +#define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - /// Macro defined when currently compiling on device (only defined in the device context). - #define LVARRAY_DEVICE_COMPILE - /// Marks a function/lambda for inlining - #define LVARRAY_FORCE_INLINE __forceinline__ +/// Macro defined when currently compiling on device (only defined in the device context). +#define LVARRAY_DEVICE_COMPILE +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE __forceinline__ #else - /// Marks a function/lambda for inlining - #define LVARRAY_FORCE_INLINE inline +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) - // Denotes whether to define decorator macros later in this file. - #define LVARRAY_DECORATE +// Denotes whether to define decorator macros later in this file. 
+#define LVARRAY_DECORATE #endif @@ -137,10 +137,10 @@ { \ constexpr char const * formatString = "***** ERROR\n" \ "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ asm ( "trap;" ); \ } \ @@ -578,9 +578,9 @@ * the host. To use place directly above a the template. */ #if defined(LVARRAY_USE_CUDA) - #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#define DISABLE_HD_WARNING _Pragma("hd_warning_disable") #else - #define DISABLE_HD_WARNING +#define DISABLE_HD_WARNING #endif #else /// Mark a function for both host and device usage. diff --git a/src/math.hpp b/src/math.hpp index 61d0237e..d45f68f3 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -339,7 +339,7 @@ LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - return __hmax2( a, b ); + return __hmax2( a, b ); #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -643,9 +643,9 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) #if defined(LVARRAY_USE_CUDA) - ::sincos( theta, &sinTheta, &cosTheta ); + ::sincos( theta, &sinTheta, &cosTheta ); #elif defined(LVARRAY_USE_HIP) - ::sincosf( theta, &sinTheta, &cosTheta ); + ::sincosf( theta, &sinTheta, &cosTheta ); #endif #else sinTheta = std::sin( theta ); diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp index 5ef2a6a1..8bd5aaed 100644 --- a/unitTests/testArray_ChaiBuffer.cpp +++ b/unitTests/testArray_ChaiBuffer.cpp @@ -144,7 +144,7 @@ TYPED_TEST( ArrayTest, DeviceAlloc ) { this->testHIPDeviceAlloc(); } - + #endif } // namespace testing diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index ae12886f..27b3401a 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -44,7 +44,7 @@ class ChaiBufferTest : public ::testing::Test #elif defined( LVARRAY_USE_HIP ) auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; - std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 639b20ad..a4a3efa1 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -85,7 +85,7 @@ struct RAJAHelper< RAJA::hip_exec< N > > using AtomicPolicy = RAJA::hip_atomic; static constexpr MemorySpace space = MemorySpace::hip; }; - + #endif template< typename POLICY, typename INDEX_TYPE, typename LAMBDA > From d97d49979c84d35ded9e7b445b9eb8918ff3adcc Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:35:03 -0700 Subject: 
[PATCH 25/34] better raja scan version guard --- src/ArrayOfArraysView.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 58aa5e96..592154d8 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,7 +732,7 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -#if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 +#if ( RAJA_VERSION_MAJOR == 1 && RAJA_VERSION_MINOR >= 13 ) || ( RAJA_VERSION_MAJOR > 1 ) RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); #else RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); From deb29b66f351e7286b45ec04ef5937d3ed797a38 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 5 Jul 2022 13:58:57 -0700 Subject: [PATCH 26/34] nvcc vs cce (hip-clang) differences --- host-configs/LLNL/lassen-base.cmake | 6 +++--- src/ArrayOfArraysView.hpp | 3 --- src/ArrayView.hpp | 3 --- src/CRSMatrixView.hpp | 2 -- unitTests/testTypeManipulation.cpp | 2 +- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..3a60a7f3 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -21,14 +21,14 @@ set(ENABLE_CUDA ON CACHE BOOL "") set(CUDA_TOOLKIT_ROOT_DIR /usr/tce/packages/cuda/cuda-10.1.243 CACHE STRING "") set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "") set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "") -set(CUDA_ARCH sm_70 CACHE STRING "") +set(CUDA_ARCHITECTURES sm_70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 14 CACHE STRING "") -set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") +set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCHITECTURES} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") -set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) +set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCHITECTURES}" CACHE STRING "" FORCE) # Uncomment this line to make nvcc output register usage for each kernel. # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --resource-usage" CACHE STRING "" FORCE) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 592154d8..5efb4bc0 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -202,14 +202,12 @@ class ArrayOfArraysView * @brief A constructor to create an uninitialized ArrayOfArraysView. * @note An uninitialized ArrayOfArraysView should not be used until it is assigned to. */ - LVARRAY_HOST_DEVICE ArrayOfArraysView() = default; /** * @brief Default copy constructor. * @note The copy constructor will trigger the copy constructor for @tparam BUFFER_TYPE */ - LVARRAY_HOST_DEVICE ArrayOfArraysView( ArrayOfArraysView const & ) = default; /** @@ -246,7 +244,6 @@ class ArrayOfArraysView * @brief Default copy assignment operator. * @return *this. 
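   * @note The defaulted members in this class now carry no execution-space
   * annotation; nvcc and hip-clang infer the attributes of explicitly defaulted
   * functions themselves, and spelling them out appears to be exactly where the
   * two compilers disagree.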
*/ - LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView const & ) = default; diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 1a2a3d03..aabd48bf 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -118,7 +118,6 @@ class ArrayView * @brief A constructor to create an uninitialized ArrayView. * @note An uninitialized ArrayView should not be used until it is assigned to. */ - LVARRAY_HOST_DEVICE ArrayView() = default; /** @@ -186,7 +185,6 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ - LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -208,7 +206,6 @@ class ArrayView {} /// The default destructor. - LVARRAY_HOST_DEVICE ~ArrayView() = default; /** diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp index fe3c7c99..9a8bbca5 100644 --- a/src/CRSMatrixView.hpp +++ b/src/CRSMatrixView.hpp @@ -106,13 +106,11 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE /** * @brief Default copy constructor. */ - LVARRAY_HOST_DEVICE CRSMatrixView( CRSMatrixView const & ) = default; /** * @brief Default move constructor. */ - LVARRAY_HOST_DEVICE inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp index 45ad8380..494fb038 100644 --- a/unitTests/testTypeManipulation.cpp +++ b/unitTests/testTypeManipulation.cpp @@ -81,7 +81,7 @@ CUDA_TEST( typeManipulation, forEachArg ) EXPECT_EQ( intReducer.get(), 2 ); EXPECT_EQ( floatReducer.get(), 4 ); EXPECT_EQ( doubleReducer.get(), 7 ); -#eli defined(LVARRAY_USE_HIP) +#elif defined(LVARRAY_USE_HIP) // Test on device. RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 ); RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 ); From 62bef508a73db67b9f43544bea64916613186558 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 23 Aug 2022 19:14:56 -0400 Subject: [PATCH 27/34] crusher debugging and host-configs --- cmake/CMakeBasics.cmake | 6 +-- host-configs/ORNL/crusher-cce@14.0.1.cmake | 39 +++++++++++++++++++ host-configs/ORNL/crusher-cce@14.0.2.cmake | 39 +++++++++++++++++++ .../ORNL/crusher-cpu-cce@13.0.1.cmake | 2 +- .../ORNL/crusher-cpu-cce@14.0.1.cmake | 30 ++++++++++++++ src/ChaiBuffer.hpp | 5 +-- src/Macros.hpp | 28 ++++++------- 7 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 host-configs/ORNL/crusher-cce@14.0.1.cmake create mode 100644 host-configs/ORNL/crusher-cce@14.0.2.cmake create mode 100644 host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake diff --git a/cmake/CMakeBasics.cmake b/cmake/CMakeBasics.cmake index 4c3ec217..25c4bef7 100644 --- a/cmake/CMakeBasics.cmake +++ b/cmake/CMakeBasics.cmake @@ -12,9 +12,9 @@ option( ENABLE_TOTALVIEW_OUTPUT "" OFF ) set( LVARRAY_BUILD_OBJ_LIBS OFF CACHE BOOL "" ) -if( NOT BLT_CXX_STD STREQUAL c++14 ) - MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") -endif() +# if( NOT BLT_CXX_STD STREQUAL c++14 ) +# MESSAGE(FATAL_ERROR "c++14 is NOT enabled. 
LvArray requires c++14") +# endif() blt_append_custom_compiler_flag( FLAGS_VAR CMAKE_CXX_FLAGS DEFAULT "${OpenMP_CXX_FLAGS}") diff --git a/host-configs/ORNL/crusher-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cce@14.0.1.cmake new file mode 100644 index 00000000..15c54516 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.1.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@14.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() diff --git a/host-configs/ORNL/crusher-cce@14.0.2.cmake b/host-configs/ORNL/crusher-cce@14.0.2.cmake new file mode 100644 index 00000000..d0e29023 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.2.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@14.0.2" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.2" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.2" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-ksdglvlmamju7gphtyzdavitriemedla" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.1-jxxcauxbzee6nqjmyjz45t5h4f7tv34r" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.1-vgvqpvi3cwdmvy6cu76sqoghnvprzlwu" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-7axkiea7q3hzgojswiz7qdbd2yq6bvsf" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-jptrwzs7vdbckndjg5qg4jwckfmgexmw/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-p2msdgsmomufcnwhnow5bbazg7463caf/" CACHE PATH "" ) + +# C++ options 
+set(CRAYPE_VERSION "2.7.17") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.2.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake index a4c98307..b9d64b28 100644 --- a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -1,6 +1,6 @@ set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "") -include( crusher-base.cmake ) +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") diff --git a/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake new file mode 100644 index 00000000..d25d6b2e --- /dev/null +++ b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake @@ -0,0 +1,30 @@ + +set(CONFIG_NAME "crusher-cpu-cce@14.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +# HIP Options +set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index f78998a8..83e8c254 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -412,10 +412,9 @@ class ChaiBuffer m_capacity == 0 || chaiSpace == chai::NONE ) return; + auto & am = internal::getArrayManager(); const_cast< T * & >( m_pointer ) = - static_cast< T * >( internal::getArrayManager().move( const_cast< T_non_const * >( m_pointer ), - m_pointerRecord, - chaiSpace ) ); + static_cast< T * >( am.move( const_cast< T_non_const * >( m_pointer ), m_pointerRecord, chaiSpace 
) ); if( !std::is_const< T >::value && touch ) m_pointerRecord->m_touched[ chaiSpace ] = true; m_pointerRecord->m_last_space = chaiSpace; diff --git a/src/Macros.hpp b/src/Macros.hpp index 82cf24d1..bcd98b78 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -148,20 +148,20 @@ #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ - do \ - { \ - if( EXP ) \ - { \ - std::ostringstream __oss; \ - __oss << "***** ERROR\n"; \ - __oss << "***** LOCATION: " LOCATION "\n"; \ - __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ - __oss << MSG << "\n"; \ - __oss << LvArray::system::stackTrace( true ); \ - std::cout << __oss.str() << std::endl; \ - LvArray::system::callErrorHandler(); \ - } \ - } while( false ) + // do \ + // { \ + // if( EXP ) \ + // { \ + // std::ostringstream __oss; \ + // __oss << "***** ERROR\n"; \ + // __oss << "***** LOCATION: " LOCATION "\n"; \ + // __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ + // __oss << MSG << "\n"; \ + // __oss << LvArray::system::stackTrace( true ); \ + // std::cout << __oss.str() << std::endl; \ + // LvArray::system::callErrorHandler(); \ + // } \ + // } while( false ) #endif /** From 9cdbb7b16ee30bd6e86233c1da7f2796c4158afc Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 29 Aug 2022 15:55:13 -0700 Subject: [PATCH 28/34] Squash --- cmake/SetupTPL.cmake | 63 ++++- scripts/uberenv/packages/lvarray/package.py | 26 ++ .../toss_4_x86_64_ib/packages.yaml | 9 + src/CMakeLists.txt | 4 + src/dense/CMakeLists.cmake | 33 +++ src/dense/common.cpp | 18 ++ src/dense/common.hpp | 37 +++ src/dense/eigendecomposition.cpp | 229 ++++++++++++++++++ src/dense/eigendecomposition.hpp | 50 ++++ 9 files changed, 461 insertions(+), 8 deletions(-) create mode 100644 src/dense/CMakeLists.cmake create mode 100644 src/dense/common.cpp create mode 100644 src/dense/common.hpp create mode 100644 src/dense/eigendecomposition.cpp create mode 100644 src/dense/eigendecomposition.hpp diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index 29a60128..c312306b 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -56,6 +56,12 @@ endif() # CHAI ################################ if(ENABLE_CHAI) + if(NOT EXISTS ${CHAI_DIR}) + message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") + endif() + + message(STATUS "Using CHAI from ${CHAI_DIR}") + if(NOT ENABLE_UMPIRE) message(FATAL_ERROR "Umpire must be enabled to use CHAI.") endif() @@ -111,16 +117,57 @@ endif() ################################ # Python ################################ -if ( ENABLE_PYLVARRAY ) - message( STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}" ) - find_package( Python3 REQUIRED - COMPONENTS Development NumPy ) +if(ENABLE_PYLVARRAY) + message(STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}") + find_package(Python3 REQUIRED + COMPONENTS Development NumPy) + + message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}") + message(STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}") + message(STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}") + + set(thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy) +else() + message(STATUS "Not building pylvarray") +endif() + +################################ +# LAPACK/BLAS +################################ +if(ENABLE_LAPACK) + message(STATUS "BLAS_LIBRARIES = ${BLAS_LIBRARIES}") + message(STATUS "LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") - message( STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}" ) 
- message( STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}" ) - message( STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}" ) + blt_import_library(NAME blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${BLAS_LIBRARIES}) - set( thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy ) + blt_import_library(NAME lapack + DEPENDS_ON blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${LAPACK_LIBRARIES}) + + set(thirdPartyLibs ${thirdPartyLibs} blas lapack) +else() + message(STATUS "Not using LAPACK or BLAS.") +endif() + +################################ +# MAGMA +################################ +if(ENABLE_MAGMA) + message(STATUS "Using MAGMA from ${MAGMA_DIR}") + + if(NOT ENABLE_LAPACK) + message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") + endif() + + find_package(magma REQUIRED + PATHS ${MAGMA_DIR}) + + set(thirdPartyLibs ${thirdPartyLibs} magma) +else() + message(STATUS "Not using MAGMA.") endif() set( thirdPartyLibs ${thirdPartyLibs} CACHE STRING "" ) diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index b377bdfa..c5a3b35b 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -50,15 +50,21 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') + # variant('lapack', default=False, description='Build LAPACK and BLAS support') + # variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') + variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) + + # conflicts('~lapack', when='+magma') + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') @@ -76,6 +82,10 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') + # depends_on('blas', when='+lapack') + # depends_on('lapack', when='+lapack') + # depends_on('magma', when='+magma') + depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') @@ -313,6 +323,22 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + # cfg.write('#{0}\n'.format('-' * 80)) + # cfg.write('# Math libraries\n') + # cfg.write('#{0}\n\n'.format('-' * 80)) + # if '+lapack' in spec: + # cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + # cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + # cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + # else: + # cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + # if '+magma' in spec: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) + # else: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) + cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) diff --git 
a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
index e7ed36f4..ea2998fc 100644
--- a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
+++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
@@ -2,6 +2,15 @@ packages:
   all:
     target: [default]
     compiler: [gcc, clang, intel]
+    providers:
+      blas: [intel-mkl]
+      lapack: [intel-mkl]
+
+  intel-mkl:
+    buildable: False
+    externals:
+    - spec: intel-mkl@2020.0.166 threads=openmp
+      prefix: /usr/tce/packages/mkl/mkl-2020.0/
 
   cmake:
     buildable: False
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index da7c512f..8d4ad2ca 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -79,3 +79,7 @@ lvarray_add_code_checks( PREFIX lvarray
 if( ENABLE_PYLVARRAY )
   add_subdirectory( python )
 endif()
+
+if( ENABLE_LAPACK )
+  add_subdirectory( dense )
+endif()
diff --git a/src/dense/CMakeLists.cmake b/src/dense/CMakeLists.cmake
new file mode 100644
index 00000000..0f7096ae
--- /dev/null
+++ b/src/dense/CMakeLists.cmake
@@ -0,0 +1,33 @@
+set( lvarraydense_headers
+     common.hpp
+     eigendecomposition.hpp
+     )
+
+set( lvarraydense_sources
+     common.cpp
+     eigendecomposition.cpp
+     )
+
+blt_add_library( NAME lvarraydense
+                 SOURCES ${lvarraydense_sources}
+                 HEADERS ${lvarraydense_headers}
+                 DEPENDS_ON lvarray ${lvarray_dependencies} blas lapack
+                 SHARED TRUE
+                 CLEAR_PREFIX TRUE
+                 )
+
+target_include_directories( lvarraydense
+                            PUBLIC
+                            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+                            $<INSTALL_INTERFACE:include> )
+
+install( TARGETS lvarraydense
+         EXPORT lvarraydense
+         ARCHIVE DESTINATION lib
+         LIBRARY DESTINATION lib
+         RUNTIME DESTINATION lib )
+
+install( EXPORT lvarraydense
+         DESTINATION share/lvarray/cmake/ )
+
+lvarray_add_code_checks( PREFIX lvarraydense )
diff --git a/src/dense/common.cpp b/src/dense/common.cpp
new file mode 100644
index 00000000..75c06070
--- /dev/null
+++ b/src/dense/common.cpp
@@ -0,0 +1,18 @@
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+char const * getOption( SymmetricMatrixStorageType const option )
+{
+  static constexpr char const * const upper = "U";
+  static constexpr char const * const lower = "L";
+
+  return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ?
upper : lower; +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp new file mode 100644 index 00000000..40072a91 --- /dev/null +++ b/src/dense/common.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include "common.hpp" +#include "../Array.hpp" +#include "../ChaiBuffer.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +enum class SymmetricMatrixStorageType +{ + UPPER_TRIANGULAR, + LOWER_TRIANGULAR, +}; + +/** + * TODO: move to internal namespace + */ +char const * getOption( SymmetricMatrixStorageType const option ); + +/** + * + */ +template< typename T > +struct Workspace +{ + Array< std::complex< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > workComplex; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rWork; +}; + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp new file mode 100644 index 00000000..fc51132a --- /dev/null +++ b/src/dense/eigendecomposition.cpp @@ -0,0 +1,229 @@ +#include "eigendecomposition.hpp" + +/// This macro provide a flexible interface for Fortran naming convention for compiled objects +// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE +#define FORTRAN_MANGLE( name ) name +// #else +// #define FORTRAN_MANGLE( name ) name ## _ +// #endif + +extern "C" +{ + +#define LVARRAY_CHEEV FORTRAN_MANGLE( cheev ) +void LVARRAY_CHEEV( + char const * JOBZ, + char const * UPLO, + int const * N, + std::complex< float > * A, + int const * LDA, + float * W, + std::complex< float > * WORK, + int const * LWORK, + float const * RWORK, + int * INFO +); + +#define LVARRAY_ZHEEV FORTRAN_MANGLE( zheev ) +void LVARRAY_ZHEEV( + char const * JOBZ, + char const * UPLO, + int const * N, + std::complex< double > * A, + int const * LDA, + double * W, + std::complex< double > * WORK, + int const * LWORK, + double const * RWORK, + int * INFO ); + +#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) +void LVARRAY_ZHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + int const * N, + std::complex< double > * A, + int const * LDA, + double const * VL, + double const * VU, + int const * IL, + int const * IU, + double const * ABSTOL, + int * M, + double * W, + double * Z, + int const * LDZ, + int * ISUPPZ, + std::complex< double > * WORK, + int const * LWORK, + double * RWORK, + int * LRWORK, + int const * IWORK, + int const * LIWORK, + int * INFO ); + + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ +namespace internal +{ + +/** + * + */ +char const * getOption( EigenDecompositionOption const option ) +{ + static constexpr char const * const eigenvalueString = "N"; + static constexpr char const * const eigenvectorString = "V"; + + return option == EigenDecompositionOption::EIGENVALUES ? 
eigenvalueString : eigenvectorString; +} + +struct HEEVR_status +{ + int LWORK; + int LRWORK; + int LIWORK; + bool success +}; + + +template< typename T, typename INDEX_TYPE > +HEEVR_Sizes heevr( + EigenDecompositionOption const decompositionOptions, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template< typename T, typename INDEX_TYPE > +void heev( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::cpu, "Device not yet supported." ); + + LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), A.size( 1 ), + "The matrix A must be square." ); + + LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), eigenValues.size(), + "The matrix A and lambda have incompatible sizes." ); + + // define the arguments of zheev + int const N = LvArray::integerConversion< int >( A.size( 0 ) ); + int const LDA = N; + int INFO; + + // Make sure that the workspace is the right size. + workspace.rWork.resizeWithoutInitializationOrDestruction( std::max( 1, 3 * N - 2 ) ); + + if( workspace.workComplex.size() < std::max( 1, 2 * N - 1 ) ); + { + std::complex< T > optimalWorkSize{ 0, 0 }; + + int LWORK = -1; + + if( std::is_same_v< T, float > ) + { + LVARRAY_CHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + nullptr, + &LDA, + nullptr, + &optimalWorkSize, + &LWORK, + nullptr, + &INFO ); + } + else + { + LVARRAY_ZHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + nullptr, + &LDA, + nullptr, + &optimalWorkSize, + &LWORK, + nullptr, + &INFO ); + } + + LVARRAY_ERROR_IF_NE_MSG( INFO, 0, + "Error in computing the optimal workspace size." ); + + workspace.workComplex.resizeWithoutInitializationOrDestruction( + static_cast< INDEX_TYPE >( optimalWorkSize.real() ) ); + } + + int const LWORK = integerConversion< int >( workspace.workComplex.size() ); + + if( std::is_same< T, float >::value ) + { + LVARRAY_CHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + A.data(), + &LDA, + eigenValues.data(), + workspace.workComplex.data(), + &LWORK, + workspace.rWork.data(), + &INFO ); + } + else + { + LVARRAY_ZHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + A.data(), + &LDA, + eigenValues.data(), + workspace.workComplex.data(), + &LWORK, + workspace.rWork.data(), + &INFO ); + } + + LVARRAY_ERROR_IF_NE_MSG( INFO, 0, + "Error in computing the eigen decomposition." ); +}` + + +// explicit instantiations. 
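+// Only the float and double specializations are provided, matching the cheev and
+// zheev bindings declared above. The index type is assumed to be std::ptrdiff_t,
+// the default used by the Workspace arrays.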
+template void heev< float >( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< float >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< float, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< float > & workspace, + SymmetricMatrixStorageType const storageType ); + +template void heev< double >( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< double >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< double, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< double > & workspace, + SymmetricMatrixStorageType const storageType ); + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp new file mode 100644 index 00000000..c7d98e49 --- /dev/null +++ b/src/dense/eigendecomposition.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +struct EigenDecompositionOptions +{ + enum Type + { + EIGENVALUES, + EIGENVALUES_AND_VECTORS, + }; + + enum Range + { + ALL, + IN_RANGE, + BY_INDEX, + }; + + Type const m_type; + Range const m_range; + double const rangeMin; + double const rangeMax; + int const indexMin; + int const indexMax; +}; + +/** + * + */ +template< typename T, INDEX_TYPE > +void heev( + MemorySpace const space, + EigenDecompositionOption const options, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR +); + +} // namespace dense +} // namespace LvArray \ No newline at end of file From 9fb36e9b256fed0a2bc4a96b1adc1b217829ddfa Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Wed, 31 Aug 2022 16:47:57 -0700 Subject: [PATCH 29/34] Squash --- src/Macros.hpp | 28 +- .../{CMakeLists.cmake => CMakeLists.txt} | 0 src/dense/common.hpp | 101 ++++++- src/dense/eigendecomposition.cpp | 258 +++++++++--------- src/dense/eigendecomposition.hpp | 40 +-- 5 files changed, 271 insertions(+), 156 deletions(-) rename src/dense/{CMakeLists.cmake => CMakeLists.txt} (100%) diff --git a/src/Macros.hpp b/src/Macros.hpp index bcd98b78..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -148,20 +148,20 @@ #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ - // do \ - // { \ - // if( EXP ) \ - // { \ - // std::ostringstream __oss; \ - // __oss << "***** ERROR\n"; \ - // __oss << "***** LOCATION: " LOCATION "\n"; \ - // __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ - // __oss << MSG << "\n"; \ - // __oss << LvArray::system::stackTrace( true ); \ - // std::cout << __oss.str() << std::endl; \ - // LvArray::system::callErrorHandler(); \ - // } \ - // } while( false ) + do \ + { \ + if( EXP ) \ + { \ + std::ostringstream __oss; \ + __oss << "***** ERROR\n"; \ + __oss << "***** LOCATION: " LOCATION "\n"; \ + __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ + __oss << MSG << "\n"; \ + __oss << LvArray::system::stackTrace( true ); \ + std::cout << __oss.str() << std::endl; \ + LvArray::system::callErrorHandler(); \ + } \ + } while( false ) #endif /** diff --git a/src/dense/CMakeLists.cmake b/src/dense/CMakeLists.txt similarity index 100% rename from src/dense/CMakeLists.cmake rename to src/dense/CMakeLists.txt diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 40072a91..4588080e 
100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -1,13 +1,36 @@ #pragma once -#include "common.hpp" #include "../Array.hpp" #include "../ChaiBuffer.hpp" +#include + namespace LvArray { namespace dense { +namespace internal +{ + +/** + * TODO make a complex type and add it to the main LvArray. Make a uniform way of interacting with various complex number implementations. + */ +template< typename T > +struct RealVersion +{ + using Type = T; +}; + +/** + * + */ +template< typename T > +struct RealVersion< std::complex< T > > +{ + using Type = T; +}; + +} // namespace internal /** * @@ -27,10 +50,82 @@ char const * getOption( SymmetricMatrixStorageType const option ); * */ template< typename T > +using RealVersion = typename internal::RealVersion< T >::Type; + + +using DenseInt = int; + +/** + * + */ +template< typename T > +struct Matrix +{ + /** + * + */ + template< typename INDEX_TYPE > + Matrix( ArraySlice< T, 2, 1, INDEX_TYPE > const & slice ): + nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, + nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, + data{ &slice( 0, 0 ) } + {} + + /** + * + */ + bool isSquare() const + { + return nRows == nCols; + } + + DenseInt const nRows; + DenseInt const nCols; + DenseInt const stride; + T * const data; +}; + +/** + * + */ +template< typename T > +struct Vector +{ + template< int USD, typename INDEX_TYPE > + Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + n{ integerConversion< DenseInt >( slice.size() ) }, + stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, + data{ &slice[ 0 ] } + {} + + DenseInt const n; + DenseInt const stride; + T * const data; +}; + +/** + * TODO(corbett5): Make this into a virtual heirarchy so we can get rid of ChaiBuffer here. + * Also add a version that is only for computing sizes so no dynamic allocation needed. + * When that is done you can get rid of the constructor here. 
+ */ +template< typename T > struct Workspace { - Array< std::complex< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > workComplex; - Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rWork; + Workspace() + {} + + Workspace( std::ptrdiff_t initialSize ): + work( initialSize ), + rwork( initialSize ), + iwork( initialSize ) + {} + + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > work; + + Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rwork; + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > iwork; }; } // namespace dense diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp index fc51132a..5bf2e0c4 100644 --- a/src/dense/eigendecomposition.cpp +++ b/src/dense/eigendecomposition.cpp @@ -52,14 +52,14 @@ void LVARRAY_ZHEEVR( double const * ABSTOL, int * M, double * W, - double * Z, + std::complex< double > * Z, int const * LDZ, int * ISUPPZ, std::complex< double > * WORK, int const * LWORK, double * RWORK, - int * LRWORK, - int const * IWORK, + int const * LRWORK, + int * IWORK, int const * LIWORK, int * INFO ); @@ -73,157 +73,169 @@ namespace dense namespace internal { -/** - * - */ -char const * getOption( EigenDecompositionOption const option ) +template< typename T > +int heevr( + MemorySpace const space, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenValues, + Matrix< std::complex< T > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) { - static constexpr char const * const eigenvalueString = "N"; - static constexpr char const * const eigenvectorString = "V"; + LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); - return option == EigenDecompositionOption::EIGENVALUES ? eigenvalueString : eigenvectorString; -} + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); -struct HEEVR_status -{ - int LWORK; - int LRWORK; - int LIWORK; - bool success -}; - - -template< typename T, typename INDEX_TYPE > -HEEVR_Sizes heevr( - EigenDecompositionOption const decompositionOptions, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType, - bool const compute ) + char const * const JOBZ = decompositionOptions.typeArg(); + char const * const RANGE = decompositionOptions.rangeArg(); + char const * const UPLO = getOption( storageType ); + int const N = integerConversion< int >( A.nCols ); + int const LDA = A.stride; -} // namespace internal + T const VL = decompositionOptions.rangeMin; + T const VU = decompositionOptions.rangeMax; -//////////////////////////////////////////////////////////////////////////////////////////////////// -template< typename T, typename INDEX_TYPE > -void heev( - MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType ) -{ - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::cpu, "Device not yet supported." ); + if( decompositionOptions.range == EigenDecompositionOptions::Range::IN_INTERVAL ) + { + LVARRAY_ERROR_IF_GE( VL, VU ); + } - LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), A.size( 1 ), - "The matrix A must be square." 
); + int maxEigenvaluesToFind = N; + int const IL = decompositionOptions.indexMin; + int const IU = decompositionOptions.indexMax; + if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) + { + LVARRAY_ERROR_IF_LT( IL, 1 ); + LVARRAY_ERROR_IF_GT( IU, N ); + LVARRAY_ERROR_IF_GT( IL, IU ); - LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), eigenValues.size(), - "The matrix A and lambda have incompatible sizes." ); + maxEigenvaluesToFind = IU - IL + 1; + } - // define the arguments of zheev - int const N = LvArray::integerConversion< int >( A.size( 0 ) ); - int const LDA = N; - int INFO; + LVARRAY_ERROR_IF_LT( eigenValues.n, maxEigenvaluesToFind ); - // Make sure that the workspace is the right size. - workspace.rWork.resizeWithoutInitializationOrDestruction( std::max( 1, 3 * N - 2 ) ); + int const ABSTOL = decompositionOptions.abstol; + int M = 0; - if( workspace.workComplex.size() < std::max( 1, 2 * N - 1 ) ); + if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) { - std::complex< T > optimalWorkSize{ 0, 0 }; - - int LWORK = -1; + LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); + LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); + } - if( std::is_same_v< T, float > ) - { - LVARRAY_CHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - nullptr, - &LDA, - nullptr, - &optimalWorkSize, - &LWORK, - nullptr, - &INFO ); - } - else - { - LVARRAY_ZHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - nullptr, - &LDA, - nullptr, - &optimalWorkSize, - &LWORK, - nullptr, - &INFO ); - } + int const LDZ = eigenVectors.stride; - LVARRAY_ERROR_IF_NE_MSG( INFO, 0, - "Error in computing the optimal workspace size." ); - - workspace.workComplex.resizeWithoutInitializationOrDestruction( - static_cast< INDEX_TYPE >( optimalWorkSize.real() ) ); - } + // TODO: check ISUPPZ + + int const LWORK = compute ? integerConversion< int >( workspace.work.size() ) : -1; + int const LRWORK = integerConversion< int >( workspace.rwork.size() ); + int const LIWORK = integerConversion< int >( workspace.iwork.size() ); - int const LWORK = integerConversion< int >( workspace.workComplex.size() ); + int INFO = 0; + // With C++ 17 we can remove the reinterpret_cast with constexpr if. if( std::is_same< T, float >::value ) { - LVARRAY_CHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - A.data(), - &LDA, - eigenValues.data(), - workspace.workComplex.data(), - &LWORK, - workspace.rWork.data(), - &INFO ); } else { - LVARRAY_ZHEEV( - getOption( decompositionType ), - getOption( storageType ), + LVARRAY_ZHEEVR( + JOBZ, + RANGE, + UPLO, &N, - A.data(), + reinterpret_cast< std::complex< double > * >( A.data ), &LDA, - eigenValues.data(), - workspace.workComplex.data(), + reinterpret_cast< double const * >( &VL ), + reinterpret_cast< double const * >( &VU ), + &IL, + &IU, + reinterpret_cast< double const * >( &ABSTOL ), + &M, + reinterpret_cast< double * >( eigenValues.data ), + reinterpret_cast< std::complex< double > * >( eigenVectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< double > * >( workspace.work.data() ), &LWORK, - workspace.rWork.data(), + reinterpret_cast< double * >( workspace.rwork.data() ), + &LRWORK, + workspace.iwork.data(), + &LIWORK, &INFO ); } - - LVARRAY_ERROR_IF_NE_MSG( INFO, 0, - "Error in computing the eigen decomposition." ); -}` + LVARRAY_ERROR_IF_NE( INFO, 0 ); -// explicit instantiations. 
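+// Only heevr< float > is instantiated below; note that its branch above is still
+// empty, so only the zheevr (double) code path actually calls into LAPACK here.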
-template void heev< float >( + return M; +} + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< typename T > +int heevr( MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< float >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< float, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< float > & workspace, - SymmetricMatrixStorageType const storageType ); + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenValues, + Matrix< std::complex< T > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + bool const reallocateWork = workspace.work.size() < 2 * A.nRows; + bool const reallocateRWork = workspace.rwork.size() < 24 * A.nRows; + bool const reallocateIWork = workspace.iwork.size() < 10 * A.nRows; -template void heev< double >( + if( reallocateWork || reallocateRWork || reallocateIWork ) + { + Workspace< std::complex< T > > optimalSizes( 1 ); + internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); + + if( reallocateWork ) + { + workspace.work.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.work[ 0 ].real() ) ); + } + + if( reallocateRWork ) + { + workspace.rwork.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.rwork[ 0 ] ) ); + } + + if( reallocateIWork ) + { + workspace.rwork.resizeWithoutInitializationOrDestruction( space, optimalSizes.iwork[ 0 ] ); + } + } + + return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); +} + +// explicit instantiations. +template int heevr< float >( MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< double >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< double, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< double > & workspace, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< float > > const & A, + Vector< float > const & eigenValues, + Matrix< std::complex< float > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< float > > & workspace, SymmetricMatrixStorageType const storageType ); +// template int heevr< double >( +// MemorySpace const space, +// EigenDecompositionOptions const decompositionOptions, +// Matrix< std::complex< double > > const & A, +// Vector< double > const & eigenValues, +// Matrix< std::complex< double > > const & eigenVectors, +// Vector< int > const & support, +// Workspace< std::complex< double > > & workspace, +// SymmetricMatrixStorageType const storageType ); + } // namespace dense } // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp index c7d98e49..75ae830b 100644 --- a/src/dense/eigendecomposition.hpp +++ b/src/dense/eigendecomposition.hpp @@ -21,30 +21,38 @@ struct EigenDecompositionOptions enum Range { ALL, - IN_RANGE, + IN_INTERVAL, BY_INDEX, }; - Type const m_type; - Range const m_range; + char const * typeArg() const + { + static constexpr char const * const eigenvalueString = "N"; + static constexpr char const * const eigenvectorString = "V"; + + return type == Type::EIGENVALUES ? 
eigenvalueString : eigenvectorString; + } + + char const * rangeArg() const + { + static constexpr char const * const allString = "A"; + static constexpr char const * const intervalString = "V"; + static constexpr char const * const indexString = "I"; + + if( range == Range::ALL ) + { return allString; } + + return range == Range::IN_INTERVAL ? intervalString : indexString; + } + + Type const type; + Range const range; double const rangeMin; double const rangeMax; int const indexMin; int const indexMax; + double const abstol; }; -/** - * - */ -template< typename T, INDEX_TYPE > -void heev( - MemorySpace const space, - EigenDecompositionOption const options, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR -); - } // namespace dense } // namespace LvArray \ No newline at end of file From c7036cbe276cb671f9758bccdad27dc9e487d198 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 1 Sep 2022 01:36:40 -0700 Subject: [PATCH 30/34] Eigen stuff seems to be at least partialy working. --- scripts/uberenv/packages/lvarray/package.py | 307 ++++++++++++-------- src/ArraySlice.hpp | 9 + src/dense/CMakeLists.txt | 4 +- src/dense/common.hpp | 130 +++++++-- src/dense/eigenDecomposition.cpp | 270 +++++++++++++++++ src/dense/eigenDecomposition.hpp | 202 +++++++++++++ src/dense/eigendecomposition.cpp | 241 --------------- src/dense/eigendecomposition.hpp | 58 ---- unitTests/CMakeLists.txt | 5 + unitTests/dense/CMakeLists.txt | 34 +++ unitTests/dense/testEigenDecomposition.cpp | 62 ++++ 11 files changed, 881 insertions(+), 441 deletions(-) create mode 100644 src/dense/eigenDecomposition.cpp create mode 100644 src/dense/eigenDecomposition.hpp delete mode 100644 src/dense/eigendecomposition.cpp delete mode 100644 src/dense/eigendecomposition.hpp create mode 100644 unitTests/dense/CMakeLists.txt create mode 100644 unitTests/dense/testEigenDecomposition.cpp diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index c5a3b35b..347ca123 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -18,6 +18,12 @@ def cmake_cache_entry(name, value, comment=""): return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name, value, comment) +def cmake_cache_list(name, value, comment=""): + """Generate a list for a cmake cache variable""" + + indent = 5 + len(name) + join_str = '\n' + ' ' * indent + return 'set(%s %s CACHE STRING "%s")\n\n' % (name, join_str.join(value), comment) def cmake_cache_string(name, string, comment=""): """Generate a string for a cmake cache variable""" @@ -50,7 +56,7 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') - # variant('lapack', default=False, description='Build LAPACK and BLAS support') + variant('lapack', default=False, description='Build LAPACK and BLAS support') # variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') @@ -65,7 +71,7 @@ class Lvarray(CMakePackage, CudaPackage): # conflicts('~lapack', when='+magma') - depends_on('blt', when='@0.2.0:', 
type='build') + depends_on('blt@0.4.1:', when='@0.2.0:', type='build') depends_on('camp') @@ -82,8 +88,8 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') - # depends_on('blas', when='+lapack') - # depends_on('lapack', when='+lapack') + depends_on('blas', when='+lapack') + depends_on('lapack', when='+lapack') # depends_on('magma', when='+magma') depends_on('doxygen@1.8.13:', when='+docs', type='build') @@ -181,132 +187,130 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cmake_exe = os.path.realpath(cmake_exe) host_config_path = self._get_host_config_path(spec) - cfg = open(host_config_path, "w") - cfg.write("#{0}\n".format("#" * 80)) - cfg.write("# Generated host-config - Edit at own risk!\n") - cfg.write("#{0}\n".format("#" * 80)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) - cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) - cfg.write("# CMake executable path: %s\n" % cmake_exe) - cfg.write("#{0}\n\n".format("-" * 80)) - - if 'blt' in spec: - cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - - ####################### - # Compiler Settings - ####################### - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Compilers\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) - cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) - - # use global spack compiler flags - cflags = ' '.join(spec.compiler_flags['cflags']) - cxxflags = ' '.join(spec.compiler_flags['cxxflags']) - - if "%intel" in spec: - cflags += ' -qoverride-limits' - cxxflags += ' -qoverride-limits' - - if cflags: - cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) - - if cxxflags: - cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + with open(host_config_path, "w") as cfg: + cfg.write("#{0}\n".format("#" * 80)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("#{0}\n".format("#" * 80)) - release_flags = "-O3 -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", - release_flags)) - reldebinf_flags = "-O3 -g -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", - reldebinf_flags)) - debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) - - if "%clang arch=linux-rhel7-ppc64le" in spec: - cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) - - if "+cuda" in spec: cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Cuda\n") + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# Spec: {0}\n".format(spec)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CUDA", True)) - cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) - - cudatoolkitdir = spec['cuda'].prefix - cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", - cudatoolkitdir)) - cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" - cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) - - cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' - 'cross-execution-space-call,reorder,' - 'deprecated-declarations') + if 'blt' in spec: + cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") - for archSpecifier in archSpecifiers: - for compilerArg 
in spec.compiler_flags['cxxflags']: - if compilerArg.startswith(archSpecifier): - cmake_cuda_flags += ' -Xcompiler ' + compilerArg + ####################### + # Compiler Settings + ####################### - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", - "-O3 -Xcompiler -O3 -DNDEBUG")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", - "-O3 -g -lineinfo -Xcompiler -O3")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", - "-O0 -Xcompiler -O0 -g -G")) - - else: - cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CAMP\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Compilers\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + + if "%intel" in spec: + cflags += ' -qoverride-limits' + cxxflags += ' -qoverride-limits' + + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + release_flags = "-O3 -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", + release_flags)) + reldebinf_flags = "-O3 -g -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", + reldebinf_flags)) + debug_flags = "-O0 -g" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) + + if "%clang arch=linux-rhel7-ppc64le" in spec: + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) + + if "+cuda" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Cuda\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + + cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' + 'cross-execution-space-call,reorder,' + 'deprecated-declarations') + + archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") + for archSpecifier in archSpecifiers: + for compilerArg in spec.compiler_flags['cxxflags']: + if compilerArg.startswith(archSpecifier): + cmake_cuda_flags += ' -Xcompiler ' + compilerArg + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", + "-O3 -Xcompiler -O3 -DNDEBUG")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", + "-O3 -g -lineinfo -Xcompiler -O3")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", + "-O0 -Xcompiler -O0 -g -G")) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CAMP\n") + 
cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# RAJA\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) - cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# RAJA\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Umpire\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) - if "+umpire" in spec: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) - cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Umpire\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CHAI\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+umpire" in spec: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) + cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) - if "+chai" in spec: - cfg.write(cmake_cache_option("ENABLE_CHAI", True)) - cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CHAI", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CHAI\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+chai" in spec: + cfg.write(cmake_cache_option("ENABLE_CHAI", True)) + cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CHAI", False)) +<<<<<<< HEAD if "+caliper" in spec: cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) @@ -358,15 +362,74 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): 'doxygen'))) else: cfg.write(cmake_cache_option("ENABLE_DOCS", False)) +======= + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# addr2line\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + if "+caliper" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) + cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Python\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) + cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) + else: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Math libraries\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+lapack' in spec: + cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + else: + cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + # if '+magma' in spec: + # 
cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) + # else: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Other\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Documentation\n") + cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: + cfg.write(cmake_cache_option("ENABLE_DOCS", True)) + sphinx_dir = spec['py-sphinx'].prefix + cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', + os.path.join(sphinx_dir, + 'bin', + 'sphinx-build'))) + + doxygen_dir = spec['doxygen'].prefix + cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', + os.path.join(doxygen_dir, + 'bin', + 'doxygen'))) + else: + cfg.write(cmake_cache_option("ENABLE_DOCS", False)) +>>>>>>> f6cec78 (Eigen stuff seems to be at least partialy working.) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# addr2line\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Other\n") + cfg.write("#{0}\n\n".format("-" * 80)) def cmake_args(self): spec = self.spec diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp index 374979e3..84357d8b 100644 --- a/src/ArraySlice.hpp +++ b/src/ArraySlice.hpp @@ -308,6 +308,15 @@ class ArraySlice return m_data[ linearIndex( indices ... ) ]; } + /** + * @brief + */ + LVARRAY_HOST_DEVICE inline constexpr + T * data() const + { + return m_data; + } + /** * @return Return a pointer to the values. * @tparam USD_ Dummy template parameter, do not specify. diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 0f7096ae..36778a28 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -1,11 +1,11 @@ set( lvarraydense_headers common.hpp - eigendecomposition.hpp + eigenDecomposition.hpp ) set( lvarraydense_sources common.cpp - eigendecomposition.cpp + eigenDecomposition.cpp ) blt_add_library( NAME lvarraydense diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 4588080e..146bb407 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -65,11 +65,21 @@ struct Matrix * */ template< typename INDEX_TYPE > - Matrix( ArraySlice< T, 2, 1, INDEX_TYPE > const & slice ): + Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, - stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, - data{ &slice( 0, 0 ) } + stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, + columnMajor{ true }, + data{ slice.data() } + {} + + template< typename INDEX_TYPE > + Matrix( T & value ): + nRows{ 1 }, + nCols{ 1 }, + stride{ 1 }, + columnMajor{ true }, + data{ &value } {} /** @@ -83,6 +93,7 @@ struct Matrix DenseInt const nRows; DenseInt const nCols; DenseInt const stride; + bool const columnMajor; T * const data; }; @@ -94,38 +105,121 @@ struct Vector { template< int USD, typename INDEX_TYPE > Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - n{ integerConversion< DenseInt >( slice.size() ) }, - stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, - data{ &slice[ 0 ] } + size{ integerConversion< DenseInt >( slice.size() ) }, + stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + data{ slice.data() } {} - DenseInt const n; + Vector( T & value ): + size{ 1 }, + stride{ 1 }, + data{ &value } + {} + + DenseInt const 
size; DenseInt const stride; T * const data; }; /** - * TODO(corbett5): Make this into a virtual heirarchy so we can get rid of ChaiBuffer here. - * Also add a version that is only for computing sizes so no dynamic allocation needed. - * When that is done you can get rid of the constructor here. + * */ template< typename T > struct Workspace { - Workspace() + virtual ~Workspace() + {}; + + virtual Vector< T > work() = 0; + + virtual Vector< RealVersion< T > > rwork() = 0; + + virtual Vector< DenseInt > iwork() = 0; + + virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; +}; + +/** + * + */ +template< typename T, template< typename > class BUFFER_TYPE > +struct ArrayWorkspace : public Workspace< T > +{ + ArrayWorkspace() {} - Workspace( std::ptrdiff_t initialSize ): - work( initialSize ), - rwork( initialSize ), - iwork( initialSize ) + virtual Vector< T > work() override + { return m_work.toSlice(); } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork.toSlice(); } + + virtual Vector< DenseInt > iwork() override + { return m_iwork.toSlice(); } + + virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override + { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override + { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override + { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + +private: + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + + Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; + + Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; +}; + +/** + * + */ +template< typename T > +struct OptimalSizeCalculation : public Workspace< T > +{ + OptimalSizeCalculation() {} - Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > work; + virtual Vector< T > work() override + { return m_work; } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork; } + + virtual Vector< int > iwork() override + { return m_iwork; } + + virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." 
); }
+
+  DenseInt optimalWorkSize() const
+  { return static_cast< DenseInt >( m_work.real() ); }
+
+  DenseInt optimalRWorkSize() const
+  { return static_cast< DenseInt >( m_rwork ); }
+
+  DenseInt optimalIWorkSize() const
+  { return m_iwork; }
+
+private:
+  T m_work;
 
-  Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rwork;
+  RealVersion< T > m_rwork;
 
-  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > iwork;
+  DenseInt m_iwork;
 };
 
 } // namespace dense
diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp
new file mode 100644
index 00000000..68a2256d
--- /dev/null
+++ b/src/dense/eigenDecomposition.cpp
@@ -0,0 +1,270 @@
+#include "eigenDecomposition.hpp"
+
+/// This macro provides a flexible interface to the Fortran name-mangling convention for compiled objects.
+// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
+#define FORTRAN_MANGLE( name ) name
+// #else
+// #define FORTRAN_MANGLE( name ) name ## _
+// #endif
+
+extern "C"
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define LVARRAY_CHEEVR FORTRAN_MANGLE( cheevr )
+void LVARRAY_CHEEVR(
+  char const * JOBZ,
+  char const * RANGE,
+  char const * UPLO,
+  LvArray::dense::DenseInt const * N,
+  std::complex< float > * A,
+  LvArray::dense::DenseInt const * LDA,
+  float const * VL,
+  float const * VU,
+  LvArray::dense::DenseInt const * IL,
+  LvArray::dense::DenseInt const * IU,
+  float const * ABSTOL,
+  LvArray::dense::DenseInt * M,
+  float * W,
+  std::complex< float > * Z,
+  LvArray::dense::DenseInt const * LDZ,
+  LvArray::dense::DenseInt * ISUPPZ,
+  std::complex< float > * WORK,
+  LvArray::dense::DenseInt const * LWORK,
+  float * RWORK,
+  LvArray::dense::DenseInt const * LRWORK,
+  LvArray::dense::DenseInt * IWORK,
+  LvArray::dense::DenseInt const * LIWORK,
+  LvArray::dense::DenseInt * INFO );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr )
+void LVARRAY_ZHEEVR(
+  char const * JOBZ,
+  char const * RANGE,
+  char const * UPLO,
+  LvArray::dense::DenseInt const * N,
+  std::complex< double > * A,
+  LvArray::dense::DenseInt const * LDA,
+  double const * VL,
+  double const * VU,
+  LvArray::dense::DenseInt const * IL,
+  LvArray::dense::DenseInt const * IU,
+  double const * ABSTOL,
+  LvArray::dense::DenseInt * M,
+  double * W,
+  std::complex< double > * Z,
+  LvArray::dense::DenseInt const * LDZ,
+  LvArray::dense::DenseInt * ISUPPZ,
+  std::complex< double > * WORK,
+  LvArray::dense::DenseInt const * LWORK,
+  double * RWORK,
+  LvArray::dense::DenseInt const * LRWORK,
+  LvArray::dense::DenseInt * IWORK,
+  LvArray::dense::DenseInt const * LIWORK,
+  LvArray::dense::DenseInt * INFO );
+
+
+} // extern "C"
+
+namespace LvArray
+{
+namespace dense
+{
+namespace internal
+{
+
+/**
+ * Dispatches to the single or double precision LAPACK routine (cheevr or zheevr).
+ * When @p compute is false the workspace sizes are passed to LAPACK as -1, which
+ * turns the call into a workspace-size query.
+ */
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType,
+  bool const compute )
+{
+  LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." );
+
+  LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square."
);
+
+  char const * const JOBZ = decompositionOptions.typeArg();
+  char const * const RANGE = decompositionOptions.rangeArg();
+  char const * const UPLO = getOption( storageType );
+  DenseInt const N = A.nCols;
+  DenseInt const LDA = A.stride;
+
+  T const VL = decompositionOptions.rangeMin;
+  T const VU = decompositionOptions.rangeMax;
+
+  DenseInt maxEigenvaluesToFind = N;
+  DenseInt const IL = decompositionOptions.indexMin;
+  DenseInt const IU = decompositionOptions.indexMax;
+  if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX )
+  {
+    LVARRAY_ERROR_IF_GT( IU, N );
+    maxEigenvaluesToFind = IU - IL + 1;
+  }
+
+  LVARRAY_ERROR_IF_LT( eigenValues.size, maxEigenvaluesToFind );
+
+  T const ABSTOL = decompositionOptions.abstol;
+  DenseInt M = 0;
+
+  if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS )
+  {
+    LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N );
+    LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind );
+  }
+
+  DenseInt const LDZ = std::max( DenseInt( 1 ), eigenVectors.stride );
+
+  if( decompositionOptions.range == EigenDecompositionOptions::Range::ALL ||
+      ( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX &&
+        maxEigenvaluesToFind == N ) )
+  {
+    LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind );
+  }
+
+  DenseInt const LWORK = compute ? workspace.work().size : -1;
+  DenseInt const LRWORK = compute ? workspace.rwork().size : -1;
+  DenseInt const LIWORK = compute ? workspace.iwork().size : -1;
+
+  DenseInt INFO = 0;
+
+  // With C++17 the reinterpret_casts could be replaced by an if constexpr dispatch.
+  if( std::is_same< T, float >::value )
+  {
+    LVARRAY_CHEEVR(
+      JOBZ,
+      RANGE,
+      UPLO,
+      &N,
+      reinterpret_cast< std::complex< float > * >( A.data ),
+      &LDA,
+      reinterpret_cast< float const * >( &VL ),
+      reinterpret_cast< float const * >( &VU ),
+      &IL,
+      &IU,
+      reinterpret_cast< float const * >( &ABSTOL ),
+      &M,
+      reinterpret_cast< float * >( eigenValues.data ),
+      reinterpret_cast< std::complex< float > * >( eigenVectors.data ),
+      &LDZ,
+      support.data,
+      reinterpret_cast< std::complex< float > * >( workspace.work().data ),
+      &LWORK,
+      reinterpret_cast< float * >( workspace.rwork().data ),
+      &LRWORK,
+      workspace.iwork().data,
+      &LIWORK,
+      &INFO );
+  }
+  else
+  {
+    LVARRAY_ZHEEVR(
+      JOBZ,
+      RANGE,
+      UPLO,
+      &N,
+      reinterpret_cast< std::complex< double > * >( A.data ),
+      &LDA,
+      reinterpret_cast< double const * >( &VL ),
+      reinterpret_cast< double const * >( &VU ),
+      &IL,
+      &IU,
+      reinterpret_cast< double const * >( &ABSTOL ),
+      &M,
+      reinterpret_cast< double * >( eigenValues.data ),
+      reinterpret_cast< std::complex< double > * >( eigenVectors.data ),
+      &LDZ,
+      support.data,
+      reinterpret_cast< std::complex< double > * >( workspace.work().data ),
+      &LWORK,
+      reinterpret_cast< double * >( workspace.rwork().data ),
+      &LRWORK,
+      workspace.iwork().data,
+      &LIWORK,
+      &INFO );
+  }
+
+  LVARRAY_ERROR_IF_NE( INFO, 0 );
+
+  return M;
+}
+
+} // namespace internal
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  // Minimum sizes documented for zheevr: LWORK >= 2N, LRWORK >= 24N, LIWORK >= 10N.
+  bool const reallocateWork = workspace.work().size < 2 * A.nRows;
+
bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows;
+  bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows;
+
+  if( reallocateWork || reallocateRWork || reallocateIWork )
+  {
+    // Query the optimal sizes by running the routine with workspace sizes of -1;
+    // OptimalSizeCalculation captures the optima without allocating anything.
+    OptimalSizeCalculation< std::complex< T > > optimalSizes;
+    internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false );
+
+    if( reallocateWork )
+    {
+      workspace.resizeWork( space, optimalSizes.optimalWorkSize() );
+    }
+
+    if( reallocateRWork )
+    {
+      workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() );
+    }
+
+    if( reallocateIWork )
+    {
+      workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() );
+    }
+  }
+
+  return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true );
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// explicit instantiations.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template DenseInt heevr< float >(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< float > > const & A,
+  Vector< float > const & eigenValues,
+  Matrix< std::complex< float > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< float > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template DenseInt heevr< double >(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< double > > const & A,
+  Vector< double > const & eigenValues,
+  Matrix< std::complex< double > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< double > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp
new file mode 100644
index 00000000..16ec001a
--- /dev/null
+++ b/src/dense/eigenDecomposition.hpp
@@ -0,0 +1,202 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+/**
+ * Options controlling which eigenvalues, and optionally eigenvectors, heevr computes.
+ */
+struct EigenDecompositionOptions
+{
+  /**
+   * Whether to compute eigenvalues only or eigenvalues and eigenvectors.
+   */
+  enum Type
+  {
+    EIGENVALUES,
+    EIGENVALUES_AND_VECTORS,
+  };
+
+  /**
+   * Which part of the spectrum to compute.
+   */
+  enum Range
+  {
+    ALL,
+    IN_INTERVAL,
+    BY_INDEX,
+  };
+
+  /**
+   * Compute all the eigenvalues, and the eigenvectors if requested.
+   */
+  EigenDecompositionOptions( Type const typeP, double const abstolP=0 ):
+    type{ typeP },
+    abstol{ abstolP }
+  {}
+
+  /**
+   * Compute only the eigenvalues in the half-open interval (rangeMin, rangeMax].
+   */
+  EigenDecompositionOptions(
+    Type const typeP,
+    double const rangeMinP,
+    double const rangeMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ Range::IN_INTERVAL },
+    rangeMin{ rangeMinP },
+    rangeMax{ rangeMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF_GE( rangeMin, rangeMax );
+  }
+
+  /**
+   * Compute only the eigenvalues with indices indexMin through indexMax.
+   * TODO: Not sure how I feel about the one based indexing for eigenvalues by index.
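+   * LAPACK's convention is one based and inclusive, so, for example (an illustrative
+   * call, not a tested one), EigenDecompositionOptions( Type::EIGENVALUES, 1, 3, 0 )
+   * would request the three smallest eigenvalues.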
+   */
+  EigenDecompositionOptions(
+    Type const typeP,
+    DenseInt const indexMinP,
+    DenseInt const indexMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ Range::BY_INDEX },
+    indexMin{ indexMinP },
+    indexMax{ indexMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF_LT( indexMin, 1 );
+    LVARRAY_ERROR_IF_GT( indexMin, indexMax );
+  }
+
+  /**
+   * @return The LAPACK JOBZ argument: "N" for eigenvalues only, "V" for eigenvalues and eigenvectors.
+   */
+  char const * typeArg() const
+  {
+    static constexpr char const * const eigenvalueString = "N";
+    static constexpr char const * const eigenvectorString = "V";
+
+    return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString;
+  }
+
+  /**
+   * @return The LAPACK RANGE argument: "A" for all, "V" for an interval, "I" for an index range.
+   */
+  char const * rangeArg() const
+  {
+    static constexpr char const * const allString = "A";
+    static constexpr char const * const intervalString = "V";
+    static constexpr char const * const indexString = "I";
+
+    if( range == Range::ALL )
+    { return allString; }
+
+    return range == Range::IN_INTERVAL ? intervalString : indexString;
+  }
+
+  /// The type of computation to perform.
+  Type const type;
+
+  /// The part of the spectrum to compute.
+  Range const range = Range::ALL;
+
+  /// The lower bound of the search interval when range == IN_INTERVAL.
+  double const rangeMin = std::numeric_limits< double >::max();
+
+  /// The upper bound of the search interval when range == IN_INTERVAL.
+  double const rangeMax = std::numeric_limits< double >::lowest();
+
+  /// The one based index of the smallest eigenvalue to find when range == BY_INDEX.
+  DenseInt const indexMin = std::numeric_limits< DenseInt >::max();
+
+  /// The one based index of the largest eigenvalue to find when range == BY_INDEX.
+  DenseInt const indexMax = std::numeric_limits< DenseInt >::lowest();
+
+  /// The absolute error tolerance for the eigenvalues; 0 selects the routine's default tolerance.
+  double const abstol = 0;
+};
+
+
+/**
+ * Compute the eigenvalues, and optionally the eigenvectors, of the Hermitian matrix @p A.
+ * @return The number of eigenvalues found.
+ */
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+/**
+ * Overload accepting ArraySlices, which are wrapped in Matrix and Vector adaptors.
+ */
+template< typename T, int USD, typename INDEX_TYPE >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A,
+  ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues,
+  ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & eigenVectors,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  Matrix< std::complex< T > > AMatrix( A );
+  Vector< T > eigenValuesVector( eigenValues );
+  Matrix< std::complex< T > > eigenVectorsMatrix( eigenVectors );
+  Vector< DenseInt > supportVector( support );
+
+  return heevr(
+    space,
+    decompositionOptions,
+    AMatrix,
+    eigenValuesVector,
+    eigenVectorsMatrix,
+    supportVector,
+    workspace,
+    storageType );
+}
+
+/**
+ * Overload accepting ArrayViews, which are moved to the given space before forwarding.
+ */
+template< typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues,
+  ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  // Unclear whether the touch on A is needed here, since half of A is destroyed; maybe it's not necessary.
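+  // Working assumption: move( space, true ) both migrates each buffer and "touches" it,
+  // marking the copy in the target space as modified so stale copies elsewhere are
+  // invalidated. heevr overwrites A and fills eigenValues, eigenVectors and support, so
+  // the outputs at least must be touched; the touch on A is the part that may be avoidable.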
+ A.move( space, true ); + eigenValues.move( space, true ); + eigenVectors.move( space, true ); + support.move( space, true ); + + return heevr( + space, + decompositionOptions, + A.toSlice(), + eigenValues.toSlice(), + eigenVectors.toSlice(), + support.toSlice(), + workspace, + storageType ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp deleted file mode 100644 index 5bf2e0c4..00000000 --- a/src/dense/eigendecomposition.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "eigendecomposition.hpp" - -/// This macro provide a flexible interface for Fortran naming convention for compiled objects -// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE -#define FORTRAN_MANGLE( name ) name -// #else -// #define FORTRAN_MANGLE( name ) name ## _ -// #endif - -extern "C" -{ - -#define LVARRAY_CHEEV FORTRAN_MANGLE( cheev ) -void LVARRAY_CHEEV( - char const * JOBZ, - char const * UPLO, - int const * N, - std::complex< float > * A, - int const * LDA, - float * W, - std::complex< float > * WORK, - int const * LWORK, - float const * RWORK, - int * INFO -); - -#define LVARRAY_ZHEEV FORTRAN_MANGLE( zheev ) -void LVARRAY_ZHEEV( - char const * JOBZ, - char const * UPLO, - int const * N, - std::complex< double > * A, - int const * LDA, - double * W, - std::complex< double > * WORK, - int const * LWORK, - double const * RWORK, - int * INFO ); - -#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) -void LVARRAY_ZHEEVR( - char const * JOBZ, - char const * RANGE, - char const * UPLO, - int const * N, - std::complex< double > * A, - int const * LDA, - double const * VL, - double const * VU, - int const * IL, - int const * IU, - double const * ABSTOL, - int * M, - double * W, - std::complex< double > * Z, - int const * LDZ, - int * ISUPPZ, - std::complex< double > * WORK, - int const * LWORK, - double * RWORK, - int const * LRWORK, - int * IWORK, - int const * LIWORK, - int * INFO ); - - -} // extern "C" - -namespace LvArray -{ -namespace dense -{ -namespace internal -{ - -template< typename T > -int heevr( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< T > > & workspace, - SymmetricMatrixStorageType const storageType, - bool const compute ) -{ - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); - - LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
); - - char const * const JOBZ = decompositionOptions.typeArg(); - char const * const RANGE = decompositionOptions.rangeArg(); - char const * const UPLO = getOption( storageType ); - int const N = integerConversion< int >( A.nCols ); - int const LDA = A.stride; - - T const VL = decompositionOptions.rangeMin; - T const VU = decompositionOptions.rangeMax; - - if( decompositionOptions.range == EigenDecompositionOptions::Range::IN_INTERVAL ) - { - LVARRAY_ERROR_IF_GE( VL, VU ); - } - - int maxEigenvaluesToFind = N; - int const IL = decompositionOptions.indexMin; - int const IU = decompositionOptions.indexMax; - if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) - { - LVARRAY_ERROR_IF_LT( IL, 1 ); - LVARRAY_ERROR_IF_GT( IU, N ); - LVARRAY_ERROR_IF_GT( IL, IU ); - - maxEigenvaluesToFind = IU - IL + 1; - } - - LVARRAY_ERROR_IF_LT( eigenValues.n, maxEigenvaluesToFind ); - - int const ABSTOL = decompositionOptions.abstol; - int M = 0; - - if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) - { - LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); - } - - int const LDZ = eigenVectors.stride; - - // TODO: check ISUPPZ - - int const LWORK = compute ? integerConversion< int >( workspace.work.size() ) : -1; - int const LRWORK = integerConversion< int >( workspace.rwork.size() ); - int const LIWORK = integerConversion< int >( workspace.iwork.size() ); - - int INFO = 0; - - // With C++ 17 we can remove the reinterpret_cast with constexpr if. - if( std::is_same< T, float >::value ) - { - } - else - { - LVARRAY_ZHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< double > * >( A.data ), - &LDA, - reinterpret_cast< double const * >( &VL ), - reinterpret_cast< double const * >( &VU ), - &IL, - &IU, - reinterpret_cast< double const * >( &ABSTOL ), - &M, - reinterpret_cast< double * >( eigenValues.data ), - reinterpret_cast< std::complex< double > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< double > * >( workspace.work.data() ), - &LWORK, - reinterpret_cast< double * >( workspace.rwork.data() ), - &LRWORK, - workspace.iwork.data(), - &LIWORK, - &INFO ); - } - - LVARRAY_ERROR_IF_NE( INFO, 0 ); - - return M; -} - -} // namespace internal - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename T > -int heevr( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< T > > & workspace, - SymmetricMatrixStorageType const storageType ) -{ - bool const reallocateWork = workspace.work.size() < 2 * A.nRows; - bool const reallocateRWork = workspace.rwork.size() < 24 * A.nRows; - bool const reallocateIWork = workspace.iwork.size() < 10 * A.nRows; - - if( reallocateWork || reallocateRWork || reallocateIWork ) - { - Workspace< std::complex< T > > optimalSizes( 1 ); - internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); - - if( reallocateWork ) - { - workspace.work.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.work[ 0 ].real() ) ); - } - - if( reallocateRWork ) - { - workspace.rwork.resizeWithoutInitializationOrDestruction( space, 
static_cast< std::ptrdiff_t >( optimalSizes.rwork[ 0 ] ) ); - } - - if( reallocateIWork ) - { - workspace.rwork.resizeWithoutInitializationOrDestruction( space, optimalSizes.iwork[ 0 ] ); - } - } - - return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); -} - -// explicit instantiations. -template int heevr< float >( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< float > > const & A, - Vector< float > const & eigenValues, - Matrix< std::complex< float > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< float > > & workspace, - SymmetricMatrixStorageType const storageType ); - -// template int heevr< double >( -// MemorySpace const space, -// EigenDecompositionOptions const decompositionOptions, -// Matrix< std::complex< double > > const & A, -// Vector< double > const & eigenValues, -// Matrix< std::complex< double > > const & eigenVectors, -// Vector< int > const & support, -// Workspace< std::complex< double > > & workspace, -// SymmetricMatrixStorageType const storageType ); - -} // namespace dense -} // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp deleted file mode 100644 index 75ae830b..00000000 --- a/src/dense/eigendecomposition.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include "common.hpp" - -namespace LvArray -{ -namespace dense -{ - -/** - * - */ -struct EigenDecompositionOptions -{ - enum Type - { - EIGENVALUES, - EIGENVALUES_AND_VECTORS, - }; - - enum Range - { - ALL, - IN_INTERVAL, - BY_INDEX, - }; - - char const * typeArg() const - { - static constexpr char const * const eigenvalueString = "N"; - static constexpr char const * const eigenvectorString = "V"; - - return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString; - } - - char const * rangeArg() const - { - static constexpr char const * const allString = "A"; - static constexpr char const * const intervalString = "V"; - static constexpr char const * const indexString = "I"; - - if( range == Range::ALL ) - { return allString; } - - return range == Range::IN_INTERVAL ? intervalString : indexString; - } - - Type const type; - Range const range; - double const rangeMin; - double const rangeMax; - int const indexMin; - int const indexMax; - double const abstol; -}; - -} // namespace dense -} // namespace LvArray \ No newline at end of file diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 4d91681e..3ac33255 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -149,3 +149,8 @@ install(TARGETS testTensorOps if( ENABLE_PYLVARRAY ) add_subdirectory( python ) endif() + +if( ENABLE_LAPACK ) + add_subdirectory( dense ) +endif() + diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt new file mode 100644 index 00000000..f324797e --- /dev/null +++ b/unitTests/dense/CMakeLists.txt @@ -0,0 +1,34 @@ +################################################################################################### +# Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. +# All rights reserved. +# See the LICENSE file for details. 
+# SPDX-License-Identifier: (BSD-3-Clause) +################################################################################################### + +# +# Specify list of tests +# +set( testSources + testEigenDecomposition.cpp + ) + +# +# Add gtest C++ based tests +# +foreach(test ${testSources}) + get_filename_component( test_name ${test} NAME_WE ) + blt_add_executable( NAME ${test_name} + SOURCES ${test} + OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY} + DEPENDS_ON gtest lvarray lvarraydense ${lvarray_dependencies} ) + + target_include_directories( ${test_name} PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../../src ) + + blt_add_test( NAME ${test_name} + COMMAND ${test_name} ) + + install(TARGETS ${test_name} + DESTINATION bin) +endforeach() + + diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp new file mode 100644 index 00000000..8f1c1a2b --- /dev/null +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +// Source includes +#include "dense/eigenDecomposition.hpp" + +#include "../testUtils.hpp" + +namespace LvArray +{ +namespace testing +{ + +template< typename T > +using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; + +template< typename T, typename PERM > +using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; + + +TEST( heevr, allEigenvalues ) +{ + Array2d< std::complex< double >, RAJA::PERM_JI > matrix( 3, 3 ); + matrix( 1, 1 ) = 2; + matrix( 0, 0 ) = 3; + matrix( 2, 2 ) = -4; + + Array1d< double > eigenvalues( 3 ); + Array2d< std::complex< double >, RAJA::PERM_JI > eigenvectors; + Array1d< int > support( 6 ); + dense::ArrayWorkspace< std::complex< double >, ChaiBuffer > workspace; + dense::SymmetricMatrixStorageType storageType = dense::SymmetricMatrixStorageType::UPPER_TRIANGULAR; + + dense::heevr< double >( + MemorySpace::host, + dense::EigenDecompositionOptions( dense::EigenDecompositionOptions::Type::EIGENVALUES ), + matrix.toView(), + eigenvalues.toView(), + eigenvectors.toView(), + support, + workspace, + storageType ); + + EXPECT_DOUBLE_EQ( eigenvalues[ 0 ], -4 ); + EXPECT_DOUBLE_EQ( eigenvalues[ 1 ], 2 ); + EXPECT_DOUBLE_EQ( eigenvalues[ 2 ], 3 ); +} + +} // namespace testing +} // namespace LvArray + +// This is the default gtest main method. It is included for ease of debugging. +int main( int argc, char * * argv ) +{ + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + return result; +} From 3ac40d6f453fa8943a908f255a0a1f3cd113a959 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 1 Sep 2022 22:01:51 -0700 Subject: [PATCH 31/34] Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces. 
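
MAGMA is located with the new find_and_register helper in SetupTPL.cmake
instead of find_package, so only magma.h and the magma library under
MAGMA_DIR are required. The registration amounts to the call added below
(reproduced here for context):

    find_and_register(NAME magma
                      INCLUDE_DIRECTORIES ${MAGMA_DIR}/include
                      LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib
                      HEADER magma.h
                      LIBRARIES magma)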
--- cmake/Config.cmake | 5 +- cmake/SetupTPL.cmake | 64 ++- scripts/uberenv/packages/lvarray/package.py | 72 +-- scripts/uberenv/packages/magma/cmake-W.patch | 12 + scripts/uberenv/packages/magma/ibm-xl.patch | 248 ++++++++++ .../packages/magma/magma-2.3.0-gcc-4.8.patch | 24 + .../packages/magma/magma-2.5.0-cmake.patch | 77 ++++ .../uberenv/packages/magma/magma-2.5.0.patch | 428 ++++++++++++++++++ scripts/uberenv/packages/magma/package.py | 125 +++++ .../blueos_3_ppc64le_ib_p9/packages.yaml | 9 + src/LvArrayConfig.hpp.in | 2 + src/dense/CMakeLists.txt | 9 +- src/dense/common.cpp | 13 + src/dense/common.hpp | 74 ++- src/dense/eigenDecomposition.cpp | 277 +++++++++--- src/dense/eigenDecomposition.hpp | 33 +- unitTests/dense/testEigenDecomposition.cpp | 116 +++-- 17 files changed, 1405 insertions(+), 183 deletions(-) create mode 100644 scripts/uberenv/packages/magma/cmake-W.patch create mode 100644 scripts/uberenv/packages/magma/ibm-xl.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.5.0.patch create mode 100644 scripts/uberenv/packages/magma/package.py diff --git a/cmake/Config.cmake b/cmake/Config.cmake index cf8ff35b..c513fbab 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,9 +2,10 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA - HIP + HIP TOTALVIEW_OUTPUT - CALIPER ) + CALIPER + MAGMA ) set( USE_CONFIGFILE ON CACHE BOOL "" ) foreach( DEP in ${PREPROCESSOR_DEFINES}) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index c312306b..c40d0582 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,3 +1,60 @@ +macro(find_and_register) + set(singleValueArgs NAME HEADER) + set(multiValueArgs INCLUDE_DIRECTORIES + LIBRARY_DIRECTORIES + LIBRARIES + EXTRA_LIBRARIES + DEPENDS ) + + ## parse the arguments + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT DEFINED arg_NAME) + message(FATAL_ERROR "The find_and_register required parameter NAME specifies the name of the library to register.") + endif() + + if(NOT DEFINED arg_INCLUDE_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter INCLUDE_DIRECTORIES specifies the directories to search for the given header.") + endif() + + if(NOT DEFINED arg_LIBRARY_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARY_DIRECTORIES specifies the directories to search for the given libraries.") + endif() + + if(NOT DEFINED arg_HEADER) + message(FATAL_ERROR "The find_and_register required parameter HEADER specifies the header to search for.") + endif() + + if(NOT DEFINED arg_LIBRARIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARIES specifies the libraries to search for.") + endif() + + find_path(${arg_NAME}_INCLUDE_DIR ${arg_HEADER} + PATHS ${arg_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH + NO_CMAKE_ENVIRONMENT_PATH + NO_CMAKE_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) + + if(${arg_NAME}_INCLUDE_DIR STREQUAL ${arg_NAME}_INCLUDE_DIR-NOTFOUND) + message(FATAL_ERROR "Could not find '${arg_HEADER}' in '${arg_INCLUDE_DIRECTORIES}'") + endif() + + blt_find_libraries(FOUND_LIBS ${arg_NAME}_LIBRARIES + NAMES ${arg_LIBRARIES} + PATHS ${arg_LIBRARY_DIRECTORIES} + REQUIRED ON) + + blt_import_library(NAME ${arg_NAME} + INCLUDES ${${arg_NAME}_INCLUDE_DIR} + LIBRARIES ${${arg_NAME}_LIBRARIES} ${arg_EXTRA_LIBRARIES} + TREAT_INCLUDES_AS_SYSTEM ON + 
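+                     # TREAT_INCLUDES_AS_SYSTEM asks the compiler to treat the imported
+                     # headers as system headers, so they don't trip this project's warning flags.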
DEPENDS_ON ${arg_DEPENDS}) + +endmacro(find_and_register) + set(thirdPartyLibs "") ############################### @@ -162,8 +219,11 @@ if(ENABLE_MAGMA) message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") endif() - find_package(magma REQUIRED - PATHS ${MAGMA_DIR}) + find_and_register(NAME magma + INCLUDE_DIRECTORIES ${MAGMA_DIR}/include + LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib + HEADER magma.h + LIBRARIES magma) set(thirdPartyLibs ${thirdPartyLibs} magma) else() diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 347ca123..7fc306fd 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -57,7 +57,7 @@ class Lvarray(CMakePackage, CudaPackage): variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') variant('lapack', default=False, description='Build LAPACK and BLAS support') - # variant('magma', default=False, description='Build MAGMA support') + variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') @@ -65,11 +65,15 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') +<<<<<<< HEAD variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) # conflicts('~lapack', when='+magma') +======= + conflicts('~lapack', when='+magma') +>>>>>>> cde43f2 (Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces.) 
depends_on('blt@0.4.1:', when='@0.2.0:', type='build') @@ -90,7 +94,7 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('blas', when='+lapack') depends_on('lapack', when='+lapack') - # depends_on('magma', when='+magma') + depends_on('magma', when='+magma') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') @@ -310,59 +314,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option("ENABLE_CHAI", False)) -<<<<<<< HEAD - if "+caliper" in spec: - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) - cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) - - cfg.write('#{0}\n'.format('-' * 80)) - cfg.write('# Python\n') - cfg.write('#{0}\n\n'.format('-' * 80)) - - if '+pylvarray' in spec: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) - cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) - else: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) - - # cfg.write('#{0}\n'.format('-' * 80)) - # cfg.write('# Math libraries\n') - # cfg.write('#{0}\n\n'.format('-' * 80)) - # if '+lapack' in spec: - # cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) - # cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) - # cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) - # else: - # cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) - - # if '+magma' in spec: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) - # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) - # else: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Documentation\n") - cfg.write("#{0}\n\n".format("-" * 80)) - - if "+docs" in spec: - cfg.write(cmake_cache_option("ENABLE_DOCS", True)) - sphinx_dir = spec['py-sphinx'].prefix - cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', - os.path.join(sphinx_dir, - 'bin', - 'sphinx-build'))) - - doxygen_dir = spec['doxygen'].prefix - cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', - os.path.join(doxygen_dir, - 'bin', - 'doxygen'))) - else: - cfg.write(cmake_cache_option("ENABLE_DOCS", False)) -======= cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Caliper\n") cfg.write("#{0}\n\n".format("-" * 80)) @@ -396,11 +347,11 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) - # if '+magma' in spec: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) - # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) - # else: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) + if '+magma' in spec: + cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + cfg.write(cmake_cache_entry('MAGMA_DIR', spec['magma'].prefix)) + else: + cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") @@ -420,7 +371,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): 'doxygen'))) else: cfg.write(cmake_cache_option("ENABLE_DOCS", False)) ->>>>>>> f6cec78 (Eigen stuff seems to be at least partialy working.) 
cfg.write("#{0}\n".format("-" * 80)) cfg.write("# addr2line\n") diff --git a/scripts/uberenv/packages/magma/cmake-W.patch b/scripts/uberenv/packages/magma/cmake-W.patch new file mode 100644 index 00000000..59179676 --- /dev/null +++ b/scripts/uberenv/packages/magma/cmake-W.patch @@ -0,0 +1,12 @@ +diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -363,8 +363,6 @@ + else() + # Primarily for gcc / nvcc: + # Ignore unused static functions in headers. +- set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-function" ) +- set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unused-function" ) + endif() + + if (CMAKE_HOST_APPLE) diff --git a/scripts/uberenv/packages/magma/ibm-xl.patch b/scripts/uberenv/packages/magma/ibm-xl.patch new file mode 100644 index 00000000..0deab656 --- /dev/null +++ b/scripts/uberenv/packages/magma/ibm-xl.patch @@ -0,0 +1,248 @@ +diff -Naur magma-2.2.0/src/dlaex3_m.cpp magma-2.2.0-patched/src/dlaex3_m.cpp +--- magma-2.2.0/src/dlaex3_m.cpp 2016-11-20 20:20:06.000000000 -0500 ++++ magma-2.2.0/src/dlaex3_m.cpp 2017-01-06 15:54:29.423668874 -0500 +@@ -197,7 +197,7 @@ + magmaDouble_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_dlaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + double d_one = 1.; + double d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. 
+ if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. + magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_dlaed3_m */ +diff -Naur magma-2.2.0/src/slaex3_m.cpp magma-2.2.0-patched/src/slaex3_m.cpp +--- magma-2.2.0/src/slaex3_m.cpp 2016-11-20 20:20:24.000000000 -0500 ++++ magma-2.2.0/src/slaex3_m.cpp 2017-01-06 10:20:13.200783151 -0500 +@@ -197,7 +197,7 @@ + magmaFloat_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_slaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + float d_one = 1.; + float d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+ magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_slaed3_m */ diff --git a/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch new file mode 100644 index 00000000..f734a5f1 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch @@ -0,0 +1,24 @@ +diff -ru magma-2.3.0/testing/testings.h magma-2.3.0-patched/testing/testings.h +--- magma-2.3.0/testing/testings.h 2017-11-14 21:34:00.000000000 -0800 ++++ magma-2.3.0-patched/testing/testings.h 2018-03-23 20:41:16.459934643 -0700 +@@ -269,4 +269,20 @@ + typename blas::traits::real_t* sigma, + FloatT* A, magma_int_t lda ); + ++// This overload for the case sigma = nullptr is a workaround for an issue ++// when building with gcc 4.8.5. This is not an issue with gcc 4.9.2. ++template< typename FloatT > ++void magma_generate_matrix( ++ magma_opts& opts, ++ magma_int_t m, magma_int_t n, ++ std::nullptr_t sigma, ++ FloatT* A, magma_int_t lda ) ++{ ++ magma_generate_matrix( ++ opts, ++ m, n, ++ (typename blas::traits::real_t*) sigma, ++ A, lda ); ++} ++ + #endif /* TESTINGS_H */ diff --git a/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch new file mode 100644 index 00000000..56b58d85 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch @@ -0,0 +1,77 @@ +diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -440,18 +440,20 @@ + # compile MAGMA sparse library + + # sparse doesn't have Fortran at the moment, so no need for above shenanigans +-include_directories( sparse/include ) +-include_directories( sparse/control ) +-include_directories( testing ) +-cuda_add_library( magma_sparse ${libsparse_all} ) +-target_link_libraries( magma_sparse +- magma ++if (MAGMA_SPARSE) ++ include_directories( sparse/include ) ++ include_directories( sparse/control ) ++ include_directories( testing ) ++ cuda_add_library( magma_sparse ${libsparse_all} ) ++ target_link_libraries( magma_sparse ++ magma + ${LAPACK_LIBRARIES} + ${CUDA_CUDART_LIBRARY} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_cusparse_LIBRARY} +-) +-set( LIBS_SPARSE ${LIBS} magma_sparse ) ++ ) ++ set( LIBS_SPARSE ${LIBS} magma_sparse ) ++endif() + + + # ---------------------------------------- +@@ -480,23 +482,31 @@ + + # ---------------------------------------- + # compile each sparse tester +-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing ) +-foreach( TEST ${sparse_testing_all} ) ++if (MAGMA_SPARSE) ++ set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing ) ++ foreach( TEST ${sparse_testing_all} ) + string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} ) + string( REGEX REPLACE "sparse/testing/" "" EXE ${EXE} ) + #message( "${TEST} --> ${EXE}" ) + add_executable( ${EXE} ${TEST} ) + target_link_libraries( ${EXE} ${LIBS_SPARSE} ) +-endforeach() ++ endforeach() ++endif() + + + # ---------------------------------------- + # what to install +-install( TARGETS magma magma_sparse ${blas_fix} ++set(MAGMA_TARGETS magma) ++set(MAGMA_HEADERS_PATTERNS include/*.h) ++if (MAGMA_SPARSE) ++ set(MAGMA_TARGETS ${MAGMA_TARGETS} magma_sparse) ++ set(MAGMA_HEADERS_PATTERNS ${MAGMA_HEADERS_PATTERNS} sparse/include/*.h) ++endif() ++install( TARGETS ${MAGMA_TARGETS} ${blas_fix} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + 
ARCHIVE DESTINATION lib ) +-file( GLOB headers include/*.h sparse/include/*.h ) ++file( GLOB headers ${MAGMA_HEADERS_PATTERNS} ) + install( FILES ${headers} + DESTINATION include ) + +@@ -509,4 +519,6 @@ + message( STATUS " NFLAGS ${CUDA_NVCC_FLAGS}" ) + message( STATUS " FFLAGS ${CMAKE_Fortran_FLAGS}" ) + message( STATUS " LIBS ${LIBS}" ) +-message( STATUS " LIBS_SPARSE ${LIBS_SPARSE}" ) ++if (MAGMA_SPARSE) ++ message( STATUS " LIBS_SPARSE ${LIBS_SPARSE}" ) ++endif() diff --git a/scripts/uberenv/packages/magma/magma-2.5.0.patch b/scripts/uberenv/packages/magma/magma-2.5.0.patch new file mode 100644 index 00000000..1ac800c5 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.5.0.patch @@ -0,0 +1,428 @@ +diff -r 89706c0efbdb .hgtags +--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500 ++++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700 +@@ -1,3 +1,4 @@ + 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0 + b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0 + 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0 ++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0 +diff -r 89706c0efbdb include/magma_types.h +--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500 ++++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700 +@@ -77,7 +77,7 @@ + typedef magma_int_t magma_device_t; + + // Half precision in CUDA +- #if defined(__cplusplus) && CUDA_VERSION > 7500 ++ #if defined(__cplusplus) && CUDA_VERSION >= 7500 + #include + typedef __half magmaHalf; + #else +diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu +--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500 ++++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700 +@@ -15,9 +15,12 @@ + + #define PRECISION_z + ++ + namespace magma_sampleselect { + +-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) { ++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) ++{ ++#if (__CUDA_ARCH__ >= 350) + auto idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= size) { + return; +@@ -25,6 +28,7 @@ + + auto v = in[idx]; + out[idx] = real(v) * real(v) + imag(v) * imag(v); ++#endif + } + + } // namespace magma_sampleselect +@@ -164,36 +168,43 @@ + magma_queue_t queue ) + { + magma_int_t info = 0; ++ magma_int_t arch = magma_getdevice_arch(); + +- auto num_blocks = magma_ceildiv(total_size, block_size); +- auto local_work = (total_size + num_threads - 1) / num_threads; +- auto required_size = sizeof(double) * (total_size + searchtree_size) ++ if( arch >= 350 ) { ++ auto num_blocks = magma_ceildiv(total_size, block_size); ++ auto local_work = (total_size + num_threads - 1) / num_threads; ++ auto required_size = sizeof(double) * (total_size + searchtree_size) + + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1); +- auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size); ++ auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size); + +- double* gputmp = (double*)*tmp_ptr; +- double* gputree = gputmp + total_size; +- uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size); +- int32_t* gpurankout = (int32_t*)(gpubucketidx + 1); +- int32_t* gpucounts = gpurankout + 1; +- int32_t* gpulocalcounts = gpucounts + searchtree_width; +- uint32_t bucketidx{}; ++ double* gputmp = (double*)*tmp_ptr; ++ double* gputree = gputmp + total_size; ++ uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size); ++ int32_t* gpurankout = (int32_t*)(gpubucketidx + 1); ++ 
int32_t* gpucounts = gpurankout + 1; ++ int32_t* gpulocalcounts = gpucounts + searchtree_width; ++ uint32_t bucketidx{}; + +- CHECK(realloc_result); ++ CHECK(realloc_result); + +- compute_abs<<cuda_stream()>>> +- (val, gputmp, total_size); +- build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>> +- (gputmp, gputree, total_size); +- count_buckets<<cuda_stream()>>> +- (gputmp, gputree, gpulocalcounts, total_size, local_work); +- reduce_counts<<cuda_stream()>>> +- (gpulocalcounts, gpucounts, num_grouped_blocks); +- sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>> +- (gpucounts, subset_size, gpubucketidx, gpurankout); +- magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue); +- magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue); +- *thrs = std::sqrt(*thrs); ++ compute_abs<<cuda_stream()>>> ++ (val, gputmp, total_size); ++ build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>> ++ (gputmp, gputree, total_size); ++ count_buckets<<cuda_stream()>>> ++ (gputmp, gputree, gpulocalcounts, total_size, local_work); ++ reduce_counts<<cuda_stream()>>> ++ (gpulocalcounts, gpucounts, num_grouped_blocks); ++ sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>> ++ (gpucounts, subset_size, gpubucketidx, gpurankout); ++ magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue); ++ magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue); ++ *thrs = std::sqrt(*thrs); ++ } ++ else { ++ printf("error: this functionality needs CUDA architecture >= 3.5\n"); ++ info = MAGMA_ERR_NOT_SUPPORTED; ++ } + + cleanup: + return info; +diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp +--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -16,6 +16,131 @@ + #include + #endif + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. ++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? 
++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + #include "magma_internal.h" + //#include "nvToolsExt.h" + +@@ -106,10 +231,13 @@ + float c_one = MAGMA_S_ONE; + float c_neg_one = MAGMA_S_NEG_ONE; + #if 1 ++ #if CUDA_VERSION >= 9020 + const magmaHalf h_one = (magmaHalf) 1.0; + const magmaHalf h_neg_one = (magmaHalf)-1.0; +- //const magmaHalf h_one = approx_float_to_half(1.0); +- //const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #else ++ const magmaHalf h_one = approx_float_to_half(1.0); ++ const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #endif + #else + FP32 float_one = *((FP32*)&c_one); + FP16 half_one = float_to_half_full(float_one); +diff -r 89706c0efbdb src/xshgetrf_gpu.cpp +--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -92,7 +92,7 @@ + magma_mp_type_t enable_tc, + magma_mp_type_t mp_algo_type ) + { +-#if CUDA_VERSION >= 7500 ++#if CUDA_VERSION >= 9000 + #ifdef HAVE_clBLAS + #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) + #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) +diff -r 89706c0efbdb testing/testing_hgemm.cpp +--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -22,6 +22,131 @@ + #include "magma_operators.h" + #include "testings.h" + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. 
++
++typedef unsigned uint;
++
++union FP32
++{
++ uint u;
++ float f;
++ struct
++ {
++ uint Mantissa : 23;
++ uint Exponent : 8;
++ uint Sign : 1;
++ };
++};
++
++union FP16
++{
++ unsigned short u;
++ struct
++ {
++ uint Mantissa : 10;
++ uint Exponent : 5;
++ uint Sign : 1;
++ };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++ FP32 f32infty = { 255 << 23 };
++ FP32 f16max = { (127 + 16) << 23 };
++ FP32 magic = { 15 << 23 };
++ FP32 expinf = { (255 ^ 31) << 23 };
++ uint sign_mask = 0x80000000u;
++ FP16 o = { 0 };
++
++ FP32 f = *((FP32*)&fl);
++
++ uint sign = f.u & sign_mask;
++ f.u ^= sign;
++
++ if (!(f.f < f32infty.u)) // Inf or NaN
++ o.u = f.u ^ expinf.u;
++ else
++ {
++ if (f.f > f16max.f) f.f = f16max.f;
++ f.f *= magic.f;
++ }
++
++ o.u = f.u >> 13; // Take the mantissa bits
++ o.u |= sign >> 16;
++ half tmp;
++ memcpy(&tmp, &o, sizeof(half));
++ //return *((half*)&o);
++ return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++ FP16 h;
++ memcpy(&h, &hf, sizeof(half));
++
++ static const FP32 magic = { 113 << 23 };
++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++ FP32 o;
++
++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
++ uint exp = shifted_exp & o.u; // just the exponent
++ o.u += (127 - 15) << 23; // exponent adjust
++
++ // handle exponent special cases
++ if (exp == shifted_exp) // Inf/NaN?
++ o.u += (128 - 16) << 23; // extra exp adjust
++ else if (exp == 0) // Zero/Denormal?
++ {
++ o.u += 1 << 23; // extra exp adjust
++ o.f -= magic.f; // renormalize
++ }
++
++ o.u |= (h.u & 0x8000) << 16; // sign bit
++ return o.f;
++}
++#endif
++
+ /* ////////////////////////////////////////////////////////////////////////////
+ -- Testing sgemm
+ */
+@@ -47,8 +172,13 @@
+ float c_neg_one = MAGMA_S_NEG_ONE;
+ float alpha = MAGMA_S_MAKE( 0.29, -0.86 );
+ float beta = MAGMA_S_MAKE( -0.48, 0.38 );
+- magmaHalf h_alpha = (magmaHalf)alpha;
+- magmaHalf h_beta = (magmaHalf)beta;
++ #if CUDA_VERSION >= 9020
++ const magmaHalf h_alpha = (magmaHalf) alpha;
++ const magmaHalf h_beta = (magmaHalf) beta;
++ #else
++ const magmaHalf h_alpha = approx_float_to_half(alpha);
++ const magmaHalf h_beta = approx_float_to_half(beta);
++ #endif
+ magma_opts opts;
+ opts.parse_opts( argc, argv );
+
diff --git a/scripts/uberenv/packages/magma/package.py b/scripts/uberenv/packages/magma/package.py
new file mode 100644
index 00000000..8d37bec6
--- /dev/null
+++ b/scripts/uberenv/packages/magma/package.py
@@ -0,0 +1,125 @@
+# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+
+from spack import *
+
+
+class Magma(CMakePackage, CudaPackage):
+    """The MAGMA project aims to develop a dense linear algebra library similar
+    to LAPACK but for heterogeneous/hybrid architectures, starting with
+    current "Multicore+GPU" systems.
+ """ + + homepage = "http://icl.cs.utk.edu/magma/" + url = "http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz" + maintainers = ['stomov', 'luszczek'] + + version('2.5.4', sha256='7734fb417ae0c367b418dea15096aef2e278a423e527c615aab47f0683683b67') + version('2.5.3', sha256='c602d269a9f9a3df28f6a4f593be819abb12ed3fa413bba1ff8183de721c5ef6') + version('2.5.2', sha256='065feb85558f9dd6f4cc4db36ac633a3f787827fc832d0b578a049a43a195620') + version('2.5.1', sha256='ce32c199131515336b30c92a907effe0c441ebc5c5bdb255e4b06b2508de109f') + version('2.5.0', sha256='4fd45c7e46bd9d9124253e7838bbfb9e6003c64c2c67ffcff02e6c36d2bcfa33') + version('2.4.0', sha256='4eb839b1295405fd29c8a6f5b4ed578476010bf976af46573f80d1169f1f9a4f') + version('2.3.0', sha256='010a4a057d7aa1e57b9426bffc0958f3d06913c9151463737e289e67dd9ea608') + version('2.2.0', sha256='df5d4ace417e5bf52694eae0d91490c6bde4cde1b0da98e8d400c5c3a70d83a2') + + variant('fortran', default=True, + description='Enable Fortran bindings support') + variant('shared', default=True, + description='Enable shared library') + variant('cuda', default=True, description='Build with CUDA') + variant('cuda_arch', default='none', multi=True, + description='Specify CUDA architecture(s)') + + # corbett5 added this variant + variant('fortran_convention', default='default', description='LAPACK/BLAS mangling scheme', + values=('default', 'add_', 'nochange', 'upcase'), multi=False) + + depends_on('blas') + depends_on('lapack') + depends_on('cuda@8:', when='@2.5.1:') # See PR #14471 + + conflicts('~cuda', msg='Magma requires cuda') + conflicts('cuda_arch=none', + msg='Please indicate a CUDA arch value or values') + + # currently not compatible with CUDA-11 + # https://bitbucket.org/icl/magma/issues/22/cuda-11-changes-issue + # https://bitbucket.org/icl/magma/issues/25/error-cusparsesolveanalysisinfo_t-does-not + conflicts('^cuda@11:', when='@:2.5.3') + + patch('ibm-xl.patch', when='@2.2:2.5.0%xl') + patch('ibm-xl.patch', when='@2.2:2.5.0%xl_r') + patch('magma-2.3.0-gcc-4.8.patch', when='@2.3.0%gcc@:4.8') + patch('magma-2.5.0.patch', when='@2.5.0') + patch('magma-2.5.0-cmake.patch', when='@2.5.0') + patch('cmake-W.patch', when='@2.5.0:%nvhpc') + + def cmake_args(self): + spec = self.spec + options = [] + + options.extend([ + '-DCMAKE_INSTALL_PREFIX=%s' % self.prefix, + '-DCMAKE_INSTALL_NAME_DIR:PATH=%s/lib' % self.prefix, + '-DBLAS_LIBRARIES=%s' % spec['blas'].libs.joined(';'), + # As of MAGMA v2.3.0, CMakeLists.txt does not use the variable + # BLAS_LIBRARIES, but only LAPACK_LIBRARIES, so we need to + # explicitly add blas to LAPACK_LIBRARIES. 
+ '-DLAPACK_LIBRARIES=%s' % + (spec['lapack'].libs + spec['blas'].libs).joined(';') + ]) + + options += ['-DBUILD_SHARED_LIBS=%s' % + ('ON' if ('+shared' in spec) else 'OFF')] + + if '+fortran' in spec: + options.extend([ + '-DUSE_FORTRAN=yes' + ]) + if spec.satisfies('%xl') or spec.satisfies('%xl_r'): + options.extend([ + '-DCMAKE_Fortran_COMPILER=%s' % self.compiler.f77 + ]) + + # corbett5 added this else block + else: + options.extend([ + '-DUSE_FORTRAN=no' + ]) + + if spec.satisfies('^cuda'): + cuda_arch = self.spec.variants['cuda_arch'].value + if '@:2.2.0' in spec: + capabilities = ' '.join('sm{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + else: + capabilities = ' '.join('sm_{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + + if '@2.5.0' in spec: + options.extend(['-DMAGMA_SPARSE=OFF']) + if spec.compiler.name in ['xl', 'xl_r']: + options.extend(['-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=TRUE']) + + # corbett5 added these definitions + if spec.variants['fortran_convention'].value == 'add_': + options.extend(['-DFORTRAN_CONVENTION=-DADD_']) + + if spec.variants['fortran_convention'].value == 'nochange': + options.extend(['-DFORTRAN_CONVENTION=-DNOCHANGE']) + + if spec.variants['fortran_convention'].value == 'upcase': + options.extend(['-DFORTRAN_CONVENTION=-DUPCASE']) + + return options + + @run_after('install') + def post_install(self): + install('magmablas/atomics.cuh', self.prefix.include) + install('control/magma_threadsetting.h', self.prefix.include) + install('control/pthread_barrier.h', self.prefix.include) + install('control/magma_internal.h', self.prefix.include) diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index d054887c..265a6c5f 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -2,6 +2,15 @@ packages: all: target: [default] compiler: [gcc, clang, xl] + providers: + blas: [netlib-lapack] + lapack: [netlib-lapack] + + netlib-lapack: + buildable: False + externals: + - spec: netlib-lapack@3.10.0 ~external-blas + prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ cuda: buildable: False diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index dcbd30b3..bf48242a 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -32,4 +32,6 @@ #cmakedefine LVARRAY_USE_CALIPER +#cmakedefine LVARRAY_USE_MAGMA + #cmakedefine LVARRAY_ADDR2LINE_EXEC @LVARRAY_ADDR2LINE_EXEC@ diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 36778a28..3d7a1f34 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -8,10 +8,17 @@ set( lvarraydense_sources eigenDecomposition.cpp ) +set( dependencies lvarray ${lvarray_dependencies} blas lapack ) + +if( ENABLE_MAGMA ) + set( dependencies ${dependencies} magma ) +endif() + + blt_add_library( NAME lvarraydense SOURCES ${lvarraydense_sources} HEADERS ${lvarraydense_headers} - DEPENDS_ON lvarray ${lvarray_dependencies} blas lapack + DEPENDS_ON ${dependencies} SHARED TRUE CLEAR_PREFIX TRUE ) diff --git a/src/dense/common.cpp b/src/dense/common.cpp index 75c06070..8843ca82 100644 --- a/src/dense/common.cpp +++ b/src/dense/common.cpp @@ -14,5 +14,18 @@ char const * getOption( SymmetricMatrixStorageType const option ) return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ? 
upper : lower; } +//////////////////////////////////////////////////////////////////////////////////////////////////// +MemorySpace getSpaceForBackend( BuiltInBackends const backend ) +{ +#if defined( LVARRAY_USE_MAGMA ) + // TODO: This needs to be changed to MemorySpace::hip or whatever. + if( backend == BuiltInBackends::MAGMA_GPU ) return MemorySpace::cuda; +#else + LVARRAY_UNUSED_VARIABLE( backend ); +#endif + + return MemorySpace::host; +} + } // namespace dense } // namespace LvArray \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 146bb407..09ef3edd 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -53,6 +53,26 @@ template< typename T > using RealVersion = typename internal::RealVersion< T >::Type; +/** + * + */ +enum class BuiltInBackends +{ + LAPACK, +#if defined( LVARRAY_USE_MAGMA ) + MAGMA, + MAGMA_GPU, +#endif +}; + +/** + * + */ +MemorySpace getSpaceForBackend( BuiltInBackends const backend ); + +/** + * + */ using DenseInt = int; /** @@ -73,7 +93,9 @@ struct Matrix data{ slice.data() } {} - template< typename INDEX_TYPE > + /** + * + */ Matrix( T & value ): nRows{ 1 }, nCols{ 1 }, @@ -132,12 +154,20 @@ struct Workspace virtual Vector< T > work() = 0; + virtual Vector< T > work2() = 0; + + virtual Vector< T > work3() = 0; + virtual Vector< RealVersion< T > > rwork() = 0; virtual Vector< DenseInt > iwork() = 0; virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; @@ -155,6 +185,12 @@ struct ArrayWorkspace : public Workspace< T > virtual Vector< T > work() override { return m_work.toSlice(); } + virtual Vector< T > work2() override + { return m_work2.toSlice(); } + + virtual Vector< T > work3() override + { return m_work3.toSlice(); } + virtual Vector< RealVersion< T > > rwork() override { return m_rwork.toSlice(); } @@ -163,16 +199,28 @@ struct ArrayWorkspace : public Workspace< T > virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override + { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override + { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override - { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } private: Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work2; + + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work3; + Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; @@ -190,6 +238,12 @@ struct OptimalSizeCalculation : public Workspace< T > virtual Vector< T > work() 
override { return m_work; } + virtual Vector< T > work2() override + { return m_work2; } + + virtual Vector< T > work3() override + { return m_work3; } + virtual Vector< RealVersion< T > > rwork() override { return m_rwork; } @@ -199,6 +253,12 @@ struct OptimalSizeCalculation : public Workspace< T > virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } @@ -215,11 +275,15 @@ struct OptimalSizeCalculation : public Workspace< T > { return m_iwork; } private: - T m_work; + T m_work { -1 }; + + T m_work2 { -1 }; + + T m_work3 { -1 }; - RealVersion< T > m_rwork; + RealVersion< T > m_rwork { -1 }; - DenseInt m_iwork; + DenseInt m_iwork { -1 }; }; } // namespace dense diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index 68a2256d..071d5996 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -1,5 +1,9 @@ #include "eigenDecomposition.hpp" +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + /// This macro provide a flexible interface for Fortran naming convention for compiled objects // #ifdef FORTRAN_MANGLE_NO_UNDERSCORE #define FORTRAN_MANGLE( name ) name @@ -79,17 +83,17 @@ namespace internal */ template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType, bool const compute ) { - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); + LVARRAY_UNUSED_VARIABLE( backend ); LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
); @@ -105,27 +109,27 @@ DenseInt heevr( DenseInt maxEigenvaluesToFind = N; DenseInt const IL = decompositionOptions.indexMin; DenseInt const IU = decompositionOptions.indexMax; - if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) + if( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX ) { LVARRAY_ERROR_IF_GT( IU, N ); maxEigenvaluesToFind = IU - IL + 1; } - LVARRAY_ERROR_IF_LT( eigenValues.size, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_LT( eigenvalues.size, maxEigenvaluesToFind ); DenseInt const ABSTOL = decompositionOptions.abstol; DenseInt M = 0; - if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) + if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) { - LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_NE( eigenvectors.nRows, N ); + LVARRAY_ERROR_IF_LT( eigenvectors.nCols, maxEigenvaluesToFind ); } - DenseInt const LDZ = std::max( 1, eigenVectors.stride ); + DenseInt const LDZ = std::max( 1, eigenvectors.stride ); - if( decompositionOptions.range == EigenDecompositionOptions::Range::ALL || - ( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX && + if( decompositionOptions.range == EigenDecompositionOptions::ALL || + ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX && maxEigenvaluesToFind == N ) ) { LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind ); @@ -138,59 +142,178 @@ DenseInt heevr( DenseInt INFO = 0; // With C++ 17 we can remove the reinterpret_cast with constexpr if. - if( std::is_same< T, float >::value ) + if( backend == BuiltInBackends::LAPACK ) { - LVARRAY_CHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< float > * >( A.data ), - &LDA, - reinterpret_cast< float const * >( &VL ), - reinterpret_cast< float const * >( &VU ), - &IL, - &IU, - reinterpret_cast< float const * >( &ABSTOL ), - &M, - reinterpret_cast< float * >( eigenValues.data ), - reinterpret_cast< std::complex< float > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< float > * >( workspace.work().data ), - &LWORK, - reinterpret_cast< float * >( workspace.rwork().data ), - &LRWORK, - workspace.iwork().data, - &LIWORK, - &INFO ); + if( std::is_same< T, float >::value ) + { + LVARRAY_CHEEVR( + JOBZ, + RANGE, + UPLO, + &N, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + reinterpret_cast< float const * >( &VL ), + reinterpret_cast< float const * >( &VU ), + &IL, + &IU, + reinterpret_cast< float const * >( &ABSTOL ), + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< std::complex< float > * >( eigenvectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< float > * >( workspace.work().data ), + &LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + &LRWORK, + workspace.iwork().data, + &LIWORK, + &INFO ); + } + else + { + LVARRAY_ZHEEVR( + JOBZ, + RANGE, + UPLO, + &N, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + reinterpret_cast< double const * >( &VL ), + reinterpret_cast< double const * >( &VU ), + &IL, + &IU, + reinterpret_cast< double const * >( &ABSTOL ), + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< std::complex< double > * >( eigenvectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< double > * >( workspace.work().data ), + &LWORK, + reinterpret_cast< double * 
>( workspace.rwork().data ), + &LRWORK, + workspace.iwork().data, + &LIWORK, + &INFO ); + } } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_cheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + int LDWA = N; + int LDWZ = 1; + + if( compute ) + { + workspace.resizeWork2( MemorySpace::cuda, LDWA * N ); + + if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) + { + LDWZ = N; + } + + workspace.resizeWork3( MemorySpace::cuda, LDWZ * maxEigenvaluesToFind ); + } + + if( std::is_same< T, float >::value ) + { + magma_cheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaFloatComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + LVARRAY_ERROR( "Not supported." 
); + } + } +#endif else { - LVARRAY_ZHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< double > * >( A.data ), - &LDA, - reinterpret_cast< double const * >( &VL ), - reinterpret_cast< double const * >( &VU ), - &IL, - &IU, - reinterpret_cast< double const * >( &ABSTOL ), - &M, - reinterpret_cast< double * >( eigenValues.data ), - reinterpret_cast< std::complex< double > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< double > * >( workspace.work().data ), - &LWORK, - reinterpret_cast< double * >( workspace.rwork().data ), - &LRWORK, - workspace.iwork().data, - &LIWORK, - &INFO ); + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); } LVARRAY_ERROR_IF_NE( INFO, 0 ); @@ -203,15 +326,20 @@ DenseInt heevr( //////////////////////////////////////////////////////////////////////////////////////////////////// template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) { + // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. + // I'm not sure exactly how this would work for the eigenvectors though. + LVARRAY_ERROR_IF( !A.columnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.columnMajor, "Row major is not yet supported." ); + bool const reallocateWork = workspace.work().size < 2 * A.nRows; bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows; @@ -219,25 +347,30 @@ DenseInt heevr( if( reallocateWork || reallocateRWork || reallocateIWork ) { OptimalSizeCalculation< std::complex< T > > optimalSizes; - internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); + internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); + MemorySpace const space = getSpaceForBackend( backend ); + if( reallocateWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalWorkSize() ); workspace.resizeWork( space, optimalSizes.optimalWorkSize() ); } if( reallocateRWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalRWorkSize() ); workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() ); } if( reallocateIWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalIWorkSize() ); workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() ); } } - return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); + return internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, workspace, storageType, true ); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -246,22 +379,22 @@ DenseInt heevr( //////////////////////////////////////////////////////////////////////////////////////////////////// template DenseInt heevr< float >( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< float > > const & A, - Vector< float > const & 
eigenValues, - Matrix< std::complex< float > > const & eigenVectors, + Vector< float > const & eigenvalues, + Matrix< std::complex< float > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< float > > & workspace, SymmetricMatrixStorageType const storageType ); //////////////////////////////////////////////////////////////////////////////////////////////////// template DenseInt heevr< double >( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< double > > const & A, - Vector< double > const & eigenValues, - Matrix< std::complex< double > > const & eigenVectors, + Vector< double > const & eigenvalues, + Matrix< std::complex< double > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< double > > & workspace, SymmetricMatrixStorageType const storageType ); diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp index 16ec001a..e83305fa 100644 --- a/src/dense/eigenDecomposition.hpp +++ b/src/dense/eigenDecomposition.hpp @@ -37,7 +37,9 @@ struct EigenDecompositionOptions EigenDecompositionOptions( Type const typeP, double const abstolP=0 ): type{ typeP }, abstol{ abstolP } - {} + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + } /** * @@ -48,11 +50,12 @@ struct EigenDecompositionOptions double const rangeMaxP, double const abstolP ): type{ typeP }, - range{ Range::IN_INTERVAL }, + range{ IN_INTERVAL }, rangeMin{ rangeMinP }, rangeMax{ rangeMaxP }, abstol{ abstolP } { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); LVARRAY_ERROR_IF_GE( rangeMin, rangeMax ); } @@ -65,11 +68,12 @@ struct EigenDecompositionOptions DenseInt const indexMaxP, double const abstolP ): type{ typeP }, - range{ Range::IN_INTERVAL }, + range{ IN_INTERVAL }, indexMin{ indexMinP }, indexMax{ indexMaxP }, abstol{ abstolP } { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); LVARRAY_ERROR_IF_LT( indexMin, 1 ); LVARRAY_ERROR_IF_GT( indexMin, indexMax ); } @@ -82,7 +86,7 @@ struct EigenDecompositionOptions static constexpr char const * const eigenvalueString = "N"; static constexpr char const * const eigenvectorString = "V"; - return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString; + return type == EIGENVALUES ? eigenvalueString : eigenvectorString; } /** @@ -94,17 +98,17 @@ struct EigenDecompositionOptions static constexpr char const * const intervalString = "V"; static constexpr char const * const indexString = "I"; - if( range == Range::ALL ) + if( range == ALL ) { return allString; } - return range == Range::IN_INTERVAL ? intervalString : indexString; + return range == IN_INTERVAL ? 
intervalString : indexString; } /// Type const type; /// - Range const range = Range::ALL; + Range const range = ALL; /// double const rangeMin = std::numeric_limits< double >::max(); @@ -128,7 +132,7 @@ struct EigenDecompositionOptions */ template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, Vector< T > const & eigenValues, @@ -140,9 +144,9 @@ DenseInt heevr( /** * */ -template< typename T, int USD, typename INDEX_TYPE > +template< typename BACK_END, typename T, int USD, typename INDEX_TYPE > DenseInt heevr( - MemorySpace const space, + BACK_END && backend, EigenDecompositionOptions const decompositionOptions, ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A, ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, @@ -157,7 +161,7 @@ DenseInt heevr( Vector< DenseInt > supportVector( support ); return heevr( - space, + std::forward< BACK_END >( backend ), decompositionOptions, AMatrix, eigenValuesVector, @@ -170,9 +174,9 @@ DenseInt heevr( /** * */ -template< typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +template< typename BACK_END, typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > DenseInt heevr( - MemorySpace const space, + BACK_END && backend, EigenDecompositionOptions const decompositionOptions, ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A, ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues, @@ -182,13 +186,14 @@ DenseInt heevr( SymmetricMatrixStorageType const storageType ) { // Unclear about the touch here since half of A is destroyed, maybe it's not necessary. + MemorySpace const space = getSpaceForBackend( backend ); A.move( space, true ); eigenValues.move( space, true ); eigenVectors.move( space, true ); support.move( space, true ); return heevr( - space, + std::forward< BACK_END >( backend ), decompositionOptions, A.toSlice(), eigenValues.toSlice(), diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp index 8f1c1a2b..9234362c 100644 --- a/unitTests/dense/testEigenDecomposition.cpp +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -15,39 +15,103 @@ namespace LvArray namespace testing { +using namespace dense; + template< typename T > using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; template< typename T, typename PERM > using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; +template< typename T > +struct HEEVR_TEST +{ + HEEVR_TEST( BuiltInBackends const backend ): + m_backend( backend ) + {} + + void threeByThreeEigenvalues() + { + resize( 3, 3, 0 ); + + m_matrix( 1, 1 ) = 2; + m_matrix( 0, 0 ) = 3; + m_matrix( 2, 2 ) = -4; + + SymmetricMatrixStorageType storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR; + + heevr( + m_backend, + EigenDecompositionOptions( EigenDecompositionOptions::EIGENVALUES ), + m_matrix.toView(), + m_eigenvalues.toView(), + m_eigenvectors.toView(), + m_support, + m_workspace, + storageType ); + + EXPECT_DOUBLE_EQ( m_eigenvalues[ 0 ], -4 ); + EXPECT_DOUBLE_EQ( m_eigenvalues[ 1 ], 2 ); + EXPECT_DOUBLE_EQ( m_eigenvalues[ 2 ], 3 ); + } + +private: + void resize( DenseInt const n, DenseInt const nvals, DenseInt const nvec ) + { + m_matrix.resize( n, n ); + m_eigenvalues.resize( nvals ); + m_eigenvectors.resize( n, nvec );; + m_support.resize( 2 * n ); + } + + BuiltInBackends const m_backend; + Array2d< 
std::complex< T >, RAJA::PERM_JI > m_matrix; + Array1d< T > m_eigenvalues; + Array2d< std::complex< T >, RAJA::PERM_JI > m_eigenvectors; + Array1d< int > m_support; + ArrayWorkspace< std::complex< T >, ChaiBuffer > m_workspace; +}; + +TEST( eigenvalues_float, lapack ) +{ + HEEVR_TEST< float > test( BuiltInBackends::LAPACK ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, lapack ) +{ + HEEVR_TEST< double > test( BuiltInBackends::LAPACK ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_float, magma ) +{ + HEEVR_TEST< float > test( BuiltInBackends::MAGMA ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, magma ) +{ + HEEVR_TEST< double > test( BuiltInBackends::MAGMA ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_float, magma_gpu ) +{ + HEEVR_TEST< float > test( BuiltInBackends::MAGMA_GPU ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, magma_gpu ) +{ + HEEVR_TEST< double > test( BuiltInBackends::MAGMA_GPU ); -TEST( heevr, allEigenvalues ) -{ - Array2d< std::complex< double >, RAJA::PERM_JI > matrix( 3, 3 ); - matrix( 1, 1 ) = 2; - matrix( 0, 0 ) = 3; - matrix( 2, 2 ) = -4; - - Array1d< double > eigenvalues( 3 ); - Array2d< std::complex< double >, RAJA::PERM_JI > eigenvectors; - Array1d< int > support( 6 ); - dense::ArrayWorkspace< std::complex< double >, ChaiBuffer > workspace; - dense::SymmetricMatrixStorageType storageType = dense::SymmetricMatrixStorageType::UPPER_TRIANGULAR; - - dense::heevr< double >( - MemorySpace::host, - dense::EigenDecompositionOptions( dense::EigenDecompositionOptions::Type::EIGENVALUES ), - matrix.toView(), - eigenvalues.toView(), - eigenvectors.toView(), - support, - workspace, - storageType ); - - EXPECT_DOUBLE_EQ( eigenvalues[ 0 ], -4 ); - EXPECT_DOUBLE_EQ( eigenvalues[ 1 ], 2 ); - EXPECT_DOUBLE_EQ( eigenvalues[ 2 ], 3 ); + test.threeByThreeEigenvalues(); } } // namespace testing From 5ada3d5c13ddb8c447ab1f1aa38d1a575e61c556 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 6 Sep 2022 15:06:56 -0700 Subject: [PATCH 32/34] Got linear solve and eigenvalues working. 
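
The gesv wrapper added in src/dense/linearSolve.{hpp,cpp} solves A X = B
through LAPACK or MAGMA: A is overwritten with its LU factors, B with the
solution X, and pivots receives the row permutation. A minimal usage
sketch, mirroring unitTests/dense/testLinearSolve.cpp (illustrative only:
it assumes a LAPACK-enabled build, uses ChaiBuffer like the unit tests do,
and the 2x2 values are made up):

    #include "dense/linearSolve.hpp"

    using namespace LvArray;

    // Column-major 2x2 system A x = b with A = [[2, 1], [1, 3]] and b = [3, 5].
    Array< double, 2, RAJA::PERM_JI, dense::DenseInt, ChaiBuffer > A( 2, 2 );
    Array< double, 2, RAJA::PERM_JI, dense::DenseInt, ChaiBuffer > B( 2, 1 );
    Array< dense::DenseInt, 1, RAJA::PERM_I, dense::DenseInt, ChaiBuffer > pivots( 2 );

    A( 0, 0 ) = 2; A( 0, 1 ) = 1;
    A( 1, 0 ) = 1; A( 1, 1 ) = 3;
    B( 0, 0 ) = 3; B( 1, 0 ) = 5;

    // On return A holds the LU factors and B the solution x = { 0.8, 1.4 }.
    dense::gesv( dense::BuiltInBackends::LAPACK, A.toView(), B.toView(), pivots );

With MAGMA_GPU the call looks the same, except the ArrayView overload
moves A and B to device memory and keeps pivots on the host, per the
@note in linearSolve.hpp.
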
--- src/dense/CMakeLists.txt | 3 + src/dense/backendHelpers.hpp | 12 + src/dense/common.hpp | 57 ++++- src/dense/eigenDecomposition.cpp | 72 +++--- src/dense/eigenDecomposition.hpp | 33 ++- src/dense/linearSolve.cpp | 278 +++++++++++++++++++++ src/dense/linearSolve.hpp | 85 +++++++ unitTests/dense/CMakeLists.txt | 1 + unitTests/dense/testEigenDecomposition.cpp | 34 ++- unitTests/dense/testLinearSolve.cpp | 198 +++++++++++++++ 10 files changed, 715 insertions(+), 58 deletions(-) create mode 100644 src/dense/backendHelpers.hpp create mode 100644 src/dense/linearSolve.cpp create mode 100644 src/dense/linearSolve.hpp create mode 100644 unitTests/dense/testLinearSolve.cpp diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 3d7a1f34..0a1de30b 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -1,11 +1,14 @@ set( lvarraydense_headers common.hpp + backendHelpers.hpp eigenDecomposition.hpp + linearSolve.hpp ) set( lvarraydense_sources common.cpp eigenDecomposition.cpp + linearSolve.cpp ) set( dependencies lvarray ${lvarray_dependencies} blas lapack ) diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp new file mode 100644 index 00000000..144ad845 --- /dev/null +++ b/src/dense/backendHelpers.hpp @@ -0,0 +1,12 @@ +#pragma once + +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + +/// This macro provide a flexible interface for Fortran naming convention for compiled objects +// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE +#define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name +// #else +// #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _ +// #endif \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 09ef3edd..9c4fda87 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -52,6 +52,17 @@ char const * getOption( SymmetricMatrixStorageType const option ); template< typename T > using RealVersion = typename internal::RealVersion< T >::Type; +/** + * + */ +template< typename T > +static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value; + +/** + * + */ +template< typename T, typename U > +static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value; /** * @@ -86,10 +97,22 @@ struct Matrix */ template< typename INDEX_TYPE > Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + nRows{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + nCols{ integerConversion< DenseInt >( slice.size( 1 ) ) }, stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, - columnMajor{ true }, + isColumnMajor{ true }, + data{ slice.data() } + {} + + /** + * + */ + template< typename INDEX_TYPE, int USD > + Matrix( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, + nCols{ integerConversion< DenseInt >( 1 ) }, + stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + isColumnMajor{ true }, data{ slice.data() } {} @@ -100,7 +123,7 @@ struct Matrix nRows{ 1 }, nCols{ 1 }, stride{ 1 }, - columnMajor{ true }, + isColumnMajor{ true }, data{ &value } {} @@ -115,7 +138,7 @@ struct Matrix DenseInt const nRows; DenseInt const nCols; DenseInt const stride; - bool const columnMajor; + bool const isColumnMajor; T * const data; }; @@ -180,7 +203,13 @@ template< typename T, template< typename > class BUFFER_TYPE > struct ArrayWorkspace : public Workspace< T > { ArrayWorkspace() 
- {} + { + m_work.setName( "ArrayWorkspace::m_work" ); + m_work2.setName( "ArrayWorkspace::m_work2" ); + m_work3.setName( "ArrayWorkspace::m_work3" ); + m_rwork.setName( "ArrayWorkspace::m_rwork" ); + m_iwork.setName( "ArrayWorkspace::m_iwork" ); + } virtual Vector< T > work() override { return m_work.toSlice(); } @@ -198,16 +227,24 @@ struct ArrayWorkspace : public Workspace< T > { return m_iwork.toSlice(); } virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override - { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override - { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override - { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override - { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override { diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index 071d5996..e70b6561 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -1,21 +1,11 @@ #include "eigenDecomposition.hpp" - -#if defined( LVARRAY_USE_MAGMA ) - #include -#endif - -/// This macro provide a flexible interface for Fortran naming convention for compiled objects -// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE -#define FORTRAN_MANGLE( name ) name -// #else -// #define FORTRAN_MANGLE( name ) name ## _ -// #endif +#include "backendHelpers.hpp" extern "C" { //////////////////////////////////////////////////////////////////////////////////////////////////// -#define LVARRAY_CHEEVR FORTRAN_MANGLE( cheevr ) +#define LVARRAY_CHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( cheevr ) void LVARRAY_CHEEVR( char const * JOBZ, char const * RANGE, @@ -42,7 +32,7 @@ void LVARRAY_CHEEVR( LvArray::dense::DenseInt * INFO ); //////////////////////////////////////////////////////////////////////////////////////////////////// -#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) +#define LVARRAY_ZHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( zheevr ) void LVARRAY_ZHEEVR( char const * JOBZ, char const * RANGE, @@ -260,18 +250,12 @@ DenseInt heevr( else if( backend == BuiltInBackends::MAGMA_GPU ) { int LDWA = N; - int LDWZ = 1; + int LDWZ = N; if( compute ) { - workspace.resizeWork2( MemorySpace::cuda, LDWA * N ); - - if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) - { - LDWZ = N; - } - - workspace.resizeWork3( MemorySpace::cuda, LDWZ * maxEigenvaluesToFind ); + workspace.resizeWork2( MemorySpace::host, LDWA * N ); + workspace.resizeWork3( MemorySpace::host, LDWZ * maxEigenvaluesToFind ); } if( std::is_same< T, float >::value ) @@ -307,7 +291,34 @@ DenseInt heevr( } else { - LVARRAY_ERROR( "Not supported." 
); + magma_zheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaDoubleComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); } } #endif @@ -337,8 +348,8 @@ DenseInt heevr( { // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. // I'm not sure exactly how this would work for the eigenvectors though. - LVARRAY_ERROR_IF( !A.columnMajor, "Row major is not yet supported." ); - LVARRAY_ERROR_IF( !eigenvectors.columnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !A.isColumnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor, "Row major is not yet supported." ); bool const reallocateWork = workspace.work().size < 2 * A.nRows; bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; @@ -349,24 +360,19 @@ DenseInt heevr( OptimalSizeCalculation< std::complex< T > > optimalSizes; internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); - MemorySpace const space = getSpaceForBackend( backend ); - if( reallocateWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalWorkSize() ); - workspace.resizeWork( space, optimalSizes.optimalWorkSize() ); + workspace.resizeWork( MemorySpace::host, optimalSizes.optimalWorkSize() ); } if( reallocateRWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalRWorkSize() ); - workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() ); + workspace.resizeRWork( MemorySpace::host, optimalSizes.optimalRWorkSize() ); } if( reallocateIWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalIWorkSize() ); - workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() ); + workspace.resizeIWork( MemorySpace::host, optimalSizes.optimalIWorkSize() ); } } diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp index e83305fa..5e7f3819 100644 --- a/src/dense/eigenDecomposition.hpp +++ b/src/dense/eigenDecomposition.hpp @@ -144,13 +144,13 @@ DenseInt heevr( /** * */ -template< typename BACK_END, typename T, int USD, typename INDEX_TYPE > +template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE > DenseInt heevr( BACK_END && backend, EigenDecompositionOptions const decompositionOptions, - ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A, + ArraySlice< std::complex< T >, 2, USD_A, INDEX_TYPE > const & A, ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & eigenVectors, + ArraySlice< std::complex< T >, 2, USD_V, INDEX_TYPE > const & eigenVectors, ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) @@ -174,23 +174,36 @@ DenseInt heevr( /** * */ -template< typename BACK_END, typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE, 
template< typename > class BUFFER_TYPE > DenseInt heevr( BACK_END && backend, EigenDecompositionOptions const decompositionOptions, - ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A, + ArrayView< std::complex< T >, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A, ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues, - ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors, + ArrayView< std::complex< T >, 2, USD_V, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors, ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) { - // Unclear about the touch here since half of A is destroyed, maybe it's not necessary. MemorySpace const space = getSpaceForBackend( backend ); - A.move( space, true ); - eigenValues.move( space, true ); + + // The A matrix isn't touched because it is destroyed. + A.move( space, false ); eigenVectors.move( space, true ); - support.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the eigenvalues and support on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + eigenValues.move( MemorySpace::host, true ); + support.move( MemorySpace::host, true ); + } + else +#endif + { + eigenValues.move( space, true ); + support.move( space, true ); + } return heevr( std::forward< BACK_END >( backend ), diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp new file mode 100644 index 00000000..9833710f --- /dev/null +++ b/src/dense/linearSolve.cpp @@ -0,0 +1,278 @@ +#include "linearSolve.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + float * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + float * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + double * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + double * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< float > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< double > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +} // extern "C" + 
+namespace LvArray +{ +namespace dense +{ + +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ) +{ + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); + LVARRAY_ERROR_IF( !A.isColumnMajor, "The matrix A must be column major." ); + + LVARRAY_ERROR_IF_NE( A.nRows, B.nRows ); + LVARRAY_ERROR_IF( !B.isColumnMajor, "The matrix B must be column major." ); + + LVARRAY_ERROR_IF_NE( pivots.size, A.nRows ); + + DenseInt const N = A.nCols; + DenseInt const NRHS = B.nCols; + DenseInt const LDA = A.stride; + DenseInt const LDB = B.stride; + DenseInt INFO = 0; + + if( backend == BuiltInBackends::LAPACK ) + { + if( std::is_same< T, float >::value ) + { + LVARRAY_SGESV( + &N, + &NRHS, + reinterpret_cast< float * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + &LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + LVARRAY_DGESV( + &N, + &NRHS, + reinterpret_cast< double * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + LVARRAY_CGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< float > * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + LVARRAY_ZGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< double > * >( B.data ), + &LDB, + &INFO ); + } + } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv_gpu( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv_gpu( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF( INFO < 0, 
"The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< float >( + BuiltInBackends const backend, + Matrix< float > const & A, + Matrix< float > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< double >( + BuiltInBackends const backend, + Matrix< double > const & A, + Matrix< double > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< std::complex< float > >( + BuiltInBackends const backend, + Matrix< std::complex< float > > const & A, + Matrix< std::complex< float > > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< std::complex< double > >( + BuiltInBackends const backend, + Matrix< std::complex< double > > const & A, + Matrix< std::complex< double > > const & B, + Vector< DenseInt > const & pivots ); + + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/linearSolve.hpp b/src/dense/linearSolve.hpp new file mode 100644 index 00000000..3efe7719 --- /dev/null +++ b/src/dense/linearSolve.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * @brief Solves the matrix equation A X = B for X using (s, d, c, z)gesv. + * + * @tparam T The type of values in the matrices. Must be one of float, double, std::complex< float >, or std::complex< double >. + * @param backend The built in backend that implements (s, d, c, z)gesv. + * @param A The input matrix, which is overwritten with L and U from the LU decomposition. + * @param B The input right hand side, is overwritten with the solution X. + * @param pivots The permutation matrix used when factoring A. + * + * @note When using @c MAGMA_GPU as the backend both @param A and @param B should be on the GPU while @param pivots + * remains on the host. 
+ */ +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ); + +/** + * + */ +template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE > +void gesv( + BACK_END && backend, + ArraySlice< T, 2, USD_A, INDEX_TYPE > const & A, + ArraySlice< T, NDIM_B, USD_B, INDEX_TYPE > const & B, + ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & pivots ) +{ + Matrix< T > AMatrix( A ); + Matrix< T > BMatrix( B ); + Vector< DenseInt > pivotsVector( pivots ); + + gesv( + std::forward< BACK_END >( backend ), + AMatrix, + BMatrix, + pivots ); +} + +/** + * + */ +template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +void gesv( + BACK_END && backend, + ArrayView< T, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A, + ArrayView< T, NDIM_B, USD_B, INDEX_TYPE, BUFFER_TYPE > const & B, + ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & pivots ) +{ + // TODO(corbett5): Unclear about the touch here since A is destroyed but the LU decomposition may still be useful. + MemorySpace const space = getSpaceForBackend( backend ); + A.move( space, true ); + B.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the pivots on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + pivots.move( MemorySpace::host, true ); + } + else +#endif + { + pivots.move( space, true ); + } + + return gesv( + std::forward< BACK_END >( backend ), + A.toSlice(), + B.toSlice(), + pivots.toSlice() ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt index f324797e..4d58286d 100644 --- a/unitTests/dense/CMakeLists.txt +++ b/unitTests/dense/CMakeLists.txt @@ -10,6 +10,7 @@ # set( testSources testEigenDecomposition.cpp + testLinearSolve.cpp ) # diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp index 9234362c..bc29aa05 100644 --- a/unitTests/dense/testEigenDecomposition.cpp +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -10,6 +10,10 @@ #include "../testUtils.hpp" +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + namespace LvArray { namespace testing @@ -18,21 +22,28 @@ namespace testing using namespace dense; template< typename T > -using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; +using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >; template< typename T, typename PERM > -using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; +using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >; + +// TODO(corbett5): significantly improve this test. 
template< typename T > struct HEEVR_TEST { HEEVR_TEST( BuiltInBackends const backend ): m_backend( backend ) - {} + { + m_matrix.setName( "matrix" ); + m_eigenvalues.setName( "m_eigenvalues" ); + m_eigenvectors.setName( "eigenvectors" ); + m_support.setName( "support" ); + } void threeByThreeEigenvalues() { - resize( 3, 3, 0 ); + resize( 20, 20, 0 ); m_matrix( 1, 1 ) = 2; m_matrix( 0, 0 ) = 3; @@ -60,7 +71,7 @@ struct HEEVR_TEST { m_matrix.resize( n, n ); m_eigenvalues.resize( nvals ); - m_eigenvectors.resize( n, nvec );; + m_eigenvectors.resize( n, nvec ); m_support.resize( 2 * n ); } @@ -86,6 +97,8 @@ TEST( eigenvalues_double, lapack ) test.threeByThreeEigenvalues(); } +#if defined( LVARRAY_USE_MAGMA ) + TEST( eigenvalues_float, magma ) { HEEVR_TEST< float > test( BuiltInBackends::MAGMA ); @@ -114,13 +127,24 @@ TEST( eigenvalues_double, magma_gpu ) test.threeByThreeEigenvalues(); } +#endif + } // namespace testing } // namespace LvArray // This is the default gtest main method. It is included for ease of debugging. int main( int argc, char * * argv ) { +#if defined( LVARRAY_USE_MAGMA ) + magma_init(); +#endif + ::testing::InitGoogleTest( &argc, argv ); int const result = RUN_ALL_TESTS(); + +#if defined( LVARRAY_USE_MAGMA ) + magma_finalize(); +#endif + return result; } diff --git a/unitTests/dense/testLinearSolve.cpp b/unitTests/dense/testLinearSolve.cpp new file mode 100644 index 00000000..7a1ab3c8 --- /dev/null +++ b/unitTests/dense/testLinearSolve.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +// Source includes +#include "dense/linearSolve.hpp" + +#include "../testUtils.hpp" + +#include "output.hpp" + +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + +#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \ + EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \ + EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError ) + +namespace LvArray +{ +namespace testing +{ + +using namespace dense; + +template< typename T > +using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >; + +template< typename T, typename PERM > +using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >; + + +template< typename T > +struct GESV_Test : public ::testing::Test +{ + void test( BuiltInBackends const backend, DenseInt const N, DenseInt const nrhs ) + { + Array2d< T, RAJA::PERM_JI > A( N, N ); + Array2d< T, RAJA::PERM_JI > B( N, nrhs ) ; + Array1d< DenseInt > pivots( N ); + + for( DenseInt row = 0; row < N; ++row ) + { + for( DenseInt col = 0; col < N; ++col ) + { + A( row, col ) = randomNumber(); + } + + for( DenseInt col = 0; col < nrhs; ++col ) + { + B( row, col ) = randomNumber(); + } + } + + Array2d< T, RAJA::PERM_JI > ACopy( A ); + Array2d< T, RAJA::PERM_JI > X( B ); + gesv( backend, ACopy.toView(), X.toView(), pivots ); + + // TODO(corbett5): replace this with matrix matrix multiplication + X.move( MemorySpace::host, true ); + for( DenseInt i = 0; i < N; ++i ) + { + for( DenseInt j = 0; j < nrhs; ++j ) + { + T dot = 0; + for( DenseInt k = 0; k < N; ++k ) + { + dot += A( i, k ) * X( k, j ); + } + + EXPECT_COMPLEX_NEAR( dot, B( i, j ), 10 * N * std::numeric_limits< RealVersion< T > >::epsilon() ); + } + } + } + +private: + + template< typename _T=T > + std::enable_if_t< !IsComplex< _T >, T > + randomNumber() + { return m_dist( m_gen ); } + + template< typename _T=T > + std::enable_if_t< 
+  randomNumber()
+  { return { m_dist( m_gen ), m_dist( m_gen ) }; }
+
+  std::mt19937_64 m_gen;
+  std::uniform_real_distribution< RealVersion< T > > m_dist;
+};
+
+using GESV_Test_types = ::testing::Types<
+  float,
+  double,
+  std::complex< float >,
+  std::complex< double >
+  >;
+TYPED_TEST_SUITE( GESV_Test, GESV_Test_types, );
+
+TYPED_TEST( GESV_Test, LAPACK_2x2 )
+{
+  this->test( BuiltInBackends::LAPACK, 2, 1 );
+  this->test( BuiltInBackends::LAPACK, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_10x10 )
+{
+  this->test( BuiltInBackends::LAPACK, 10, 1 );
+  this->test( BuiltInBackends::LAPACK, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_100x100 )
+{
+  this->test( BuiltInBackends::LAPACK, 100, 1 );
+  this->test( BuiltInBackends::LAPACK, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_1000x1000 )
+{
+  this->test( BuiltInBackends::LAPACK, 1000, 1 );
+  this->test( BuiltInBackends::LAPACK, 1000, 10 );
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TYPED_TEST( GESV_Test, MAGMA_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA, 2, 1 );
+  this->test( BuiltInBackends::MAGMA, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA, 10, 1 );
+  this->test( BuiltInBackends::MAGMA, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA, 100, 1 );
+  this->test( BuiltInBackends::MAGMA, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA, 1000, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 10 );
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv ) +{ +#if defined( LVARRAY_USE_MAGMA ) + magma_init(); +#endif + + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + +#if defined( LVARRAY_USE_MAGMA ) + magma_finalize(); +#endif + + return result; +} From 3165719492cee7cd653e9444fbfd0d229edd3ca0 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 6 Sep 2022 22:02:54 -0700 Subject: [PATCH 33/34] Squash --- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 10 +++++----- .../spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b8353dd0..652d26c4 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -1,11 +1,11 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@upstream-2019.03.19 paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang + cxx: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r flags: cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index 265a6c5f..575d66db 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -3,14 +3,20 @@ packages: target: [default] compiler: [gcc, clang, xl] providers: - blas: [netlib-lapack] - lapack: [netlib-lapack] + blas: [netlib-lapack, essl] + lapack: [netlib-lapack, essl] netlib-lapack: buildable: False externals: - spec: netlib-lapack@3.10.0 ~external-blas prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ + + essl: + buildable: False + externals: + - spec: essl@6.2.1 ~ilp64 threads=openmp +cuda +lapack + prefix: /usr/tcetmp/packages/essl/essl-6.2.1/ cuda: buildable: False From 3725d1a70d8b243d2682b5b6aa9a4d1c44eff3e5 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 9 May 2023 23:47:14 -0700 Subject: [PATCH 34/34] Started testing gemm, need to work out some stuff. 
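
Adds a BlasLapackInterface with gemm and gesv that dispatches to the Fortran
BLAS/LAPACK symbols through a small TypeDispatch helper, reworks dense::Matrix
to carry explicit sizes and strides, and adds a dense::gemm wrapper that
normalizes row-major operands by transposing them and flipping the requested
operation. testgemm.cpp checks C = alpha * op(A) * op(B) + beta * C against a
naive triple loop for float, double and their complex counterparts.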
--- scripts/uberenv/packages/lvarray/package.py | 6 +- .../toss_4_x86_64_ib/packages.yaml | 10 +- src/dense/BlasLapackInterface.cpp | 210 ++++++++++++++++++ src/dense/BlasLapackInterface.hpp | 31 +++ src/dense/CMakeLists.txt | 6 +- src/dense/backendHelpers.hpp | 78 ++++++- src/dense/common.cpp | 26 +-- src/dense/common.hpp | 190 ++++++++-------- src/dense/dense.hpp | 52 +++++ src/dense/eigenDecomposition.cpp | 20 +- src/dense/linearSolve.cpp | 16 +- unitTests/dense/CMakeLists.txt | 5 +- unitTests/dense/testgemm.cpp | 199 +++++++++++++++++ 13 files changed, 695 insertions(+), 154 deletions(-) create mode 100644 src/dense/BlasLapackInterface.cpp create mode 100644 src/dense/BlasLapackInterface.hpp create mode 100644 src/dense/dense.hpp create mode 100644 unitTests/dense/testgemm.cpp diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 7fc306fd..df299f3e 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -65,15 +65,10 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') -<<<<<<< HEAD variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) - - # conflicts('~lapack', when='+magma') -======= conflicts('~lapack', when='+magma') ->>>>>>> cde43f2 (Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces.) depends_on('blt@0.4.1:', when='@0.2.0:', type='build') @@ -114,6 +109,7 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('umpire build_type={}'.format(bt)) depends_on('chai build_type={}'.format(bt), when='+chai') depends_on('caliper build_type={}'.format(bt), when='+caliper') + depends_on('magma build_type={}'.format(bt), when='+magma') phases = ['hostconfig', 'cmake', 'build', 'install'] diff --git a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml index ea2998fc..d3d2714a 100644 --- a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml @@ -3,14 +3,14 @@ packages: target: [default] compiler: [gcc, clang, intel] providers: - blas: [intel-mkl] - lapack: [intel-mkl] + blas: [intel-oneapi-mkl] + lapack: [intel-oneapi-mkl] - intel-mkl: + intel-oneapi-mkl: buildable: False externals: - - spec: intel-mkl@2020.0.166 threads=openmp - prefix: /usr/tce/packages/mkl/mkl-2020.0/ + - spec: intel-oneapi-mkl@2022.1.0 + prefix: /usr/tce/backend/installations/linux-rhel8-x86_64/intel-19.0.4/intel-oneapi-mkl-2022.1.0-sksz67twjxftvwchnagedk36gf7plkrp/ cmake: buildable: False diff --git a/src/dense/BlasLapackInterface.cpp b/src/dense/BlasLapackInterface.cpp new file mode 100644 index 00000000..ca4309c5 --- /dev/null +++ b/src/dense/BlasLapackInterface.cpp @@ -0,0 +1,210 @@ +#include "BlasLapackInterface.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( sgemm ) +void LVARRAY_SGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + float const * ALPHA, + float const * A, + int const * LDA, + float const * B, + int const * LDB, + float const * BETA, + float * C, + int const * LDC ); + 
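+// All of these Fortran entry points take their arguments by pointer; matrices are
+// stored column major and each leading dimension must be at least the number of
+// rows of the stored matrix (and at least one).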
+//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( dgemm ) +void LVARRAY_DGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + double const * ALPHA, + double const * A, + int const * LDA, + double const * B, + int const * LDB, + double const * BETA, + double * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( cgemm ) +void LVARRAY_CGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< float > const * ALPHA, + std::complex< float > const * A, + int const * LDA, + std::complex< float > const * B, + int const * LDB, + std::complex< float > const * BETA, + std::complex< float > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( zgemm ) +void LVARRAY_ZGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< double > const * ALPHA, + std::complex< double > const * A, + int const * LDA, + std::complex< double > const * B, + int const * LDB, + std::complex< double > const * BETA, + std::complex< double > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + int const * N, + int const * NRHS, + float * A, + int const * LDA, + int * IPIV, + float * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + int const * N, + int const * NRHS, + double * A, + int const * LDA, + int * IPIV, + double * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + int const * N, + int const * NRHS, + std::complex< float > * A, + int const * LDA, + int * IPIV, + std::complex< float > * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + int const * N, + int const * NRHS, + std::complex< double > * A, + int const * LDA, + int * IPIV, + std::complex< double > * B, + int const * LDB, + int * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +char toLapackChar( Operation const op ) +{ + if( op == Operation::NO_OP ) return 'N'; + if( op == Operation::TRANSPOSE ) return 'T'; + if( op == Operation::ADJOINT ) return 'C'; + + LVARRAY_ERROR( "Unknown operation: " << int( op ) ); + return '\0'; +} + + +template< typename T > +void BlasLapackInterface< T >::gemm( + Operation opA, + Operation opB, + T const alpha, + Matrix< T const > const & A, + Matrix< T const > const & B, + T const beta, + Matrix< T > const & C ) +{ + char const TRANSA = toLapackChar( opA ); + char const TRANSB = toLapackChar( opB ); + int const M = C.sizes[ 0 ]; + int const N = C.sizes[ 1 ]; + int const K = opA == Operation::NO_OP 
? A.sizes[ 1 ] : A.sizes[ 0 ];
+  int const LDA = std::max( std::ptrdiff_t{ 1 }, A.strides[ 1 ] );
+  int const LDB = std::max( std::ptrdiff_t{ 1 }, B.strides[ 1 ] );
+  int const LDC = std::max( std::ptrdiff_t{ 1 }, C.strides[ 1 ] );
+
+  TypeDispatch< T >::dispatch( LVARRAY_SGEMM, LVARRAY_DGEMM, LVARRAY_CGEMM, LVARRAY_ZGEMM,
+    &TRANSA,
+    &TRANSB,
+    &M,
+    &N,
+    &K,
+    &alpha,
+    A.data,
+    &LDA,
+    B.data,
+    &LDB,
+    &beta,
+    C.data,
+    &LDC );
+}
+
+
+template< typename T >
+void BlasLapackInterface< T >::gesv(
+  Matrix< T > const & A,
+  Matrix< T > const & B,
+  Vector< int > const & pivots )
+{
+  int const N = A.sizes[ 0 ];
+  int const NRHS = B.sizes[ 1 ];
+  int const LDA = A.strides[ 1 ];
+  int const LDB = B.strides[ 1 ];
+  int INFO = 0;
+
+  TypeDispatch< T >::dispatch( LVARRAY_SGESV, LVARRAY_DGESV, LVARRAY_CGESV, LVARRAY_ZGESV,
+    &N,
+    &NRHS,
+    A.data,
+    &LDA,
+    pivots.data,
+    B.data,
+    &LDB,
+    &INFO );
+
+  LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." );
+  LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 <<
+                    " ) is exactly zero so the solution could not be computed." );
+}
+
+template class BlasLapackInterface< float >;
+template class BlasLapackInterface< double >;
+template class BlasLapackInterface< std::complex< float > >;
+template class BlasLapackInterface< std::complex< double > >;
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/BlasLapackInterface.hpp b/src/dense/BlasLapackInterface.hpp
new file mode 100644
index 00000000..ed747828
--- /dev/null
+++ b/src/dense/BlasLapackInterface.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct BlasLapackInterface
+{
+  static constexpr MemorySpace MEMORY_SPACE = MemorySpace::host;
+
+  static void gemm(
+    Operation opA,
+    Operation opB,
+    T const alpha,
+    Matrix< T const > const & A,
+    Matrix< T const > const & B,
+    T const beta,
+    Matrix< T > const & C );
+
+  static void gesv(
+    Matrix< T > const & A,
+    Matrix< T > const & B,
+    Vector< int > const & pivots );
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt
index 0a1de30b..8c7b4b0c 100644
--- a/src/dense/CMakeLists.txt
+++ b/src/dense/CMakeLists.txt
@@ -1,14 +1,12 @@
 set( lvarraydense_headers
      common.hpp
      backendHelpers.hpp
-     eigenDecomposition.hpp
-     linearSolve.hpp
+     BlasLapackInterface.hpp
      )
 
 set( lvarraydense_sources
      common.cpp
-     eigenDecomposition.cpp
-     linearSolve.cpp
+     BlasLapackInterface.cpp
      )
 
 set( dependencies lvarray ${lvarray_dependencies} blas lapack )
diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp
index 144ad845..5de71cf8 100644
--- a/src/dense/backendHelpers.hpp
+++ b/src/dense/backendHelpers.hpp
@@ -1,12 +1,82 @@
 #pragma once
 
-#if defined( LVARRAY_USE_MAGMA )
-  #include <magma.h>
-#endif
+#include <complex>
 
 /// This macro provide a flexible interface for Fortran naming convention for compiled objects
 // #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
 #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name
 // #else
 // #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _
-// #endif
\ No newline at end of file
+// #endif
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct TypeDispatch
+{};
+
+template<>
+struct TypeDispatch< float >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ...
ARGS > + static constexpr auto dispatch( + F_FLOAT && fFloat, + F_DOUBLE &&, + F_CFLOAT &&, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fFloat( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< double > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE && fDouble, + F_CFLOAT &&, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fDouble( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< std::complex< float > > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE &&, + F_CFLOAT && fCFloat, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fCFloat( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< std::complex< double > > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE &&, + F_CFLOAT &&, + F_CDOUBLE && fCDouble, + ARGS && ... args ) + { + return fCDouble( std::forward< ARGS >( args ) ... ); + } +}; + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/common.cpp b/src/dense/common.cpp index 8843ca82..b1cab9fe 100644 --- a/src/dense/common.cpp +++ b/src/dense/common.cpp @@ -5,26 +5,16 @@ namespace LvArray namespace dense { -//////////////////////////////////////////////////////////////////////////////////////////////////// -char const * getOption( SymmetricMatrixStorageType const option ) +Operation transposeOp( Operation const op ) { - static constexpr char const * const upper = "U"; - static constexpr char const * const lower = "L"; + switch( op ) + { + case Operation::NO_OP: return Operation::TRANSPOSE; + case Operation::TRANSPOSE: return Operation::NO_OP; + case Operation::ADJOINT: LVARRAY_ERROR( "Not supported" ); + } - return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ? upper : lower; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -MemorySpace getSpaceForBackend( BuiltInBackends const backend ) -{ -#if defined( LVARRAY_USE_MAGMA ) - // TODO: This needs to be changed to MemorySpace::hip or whatever. 
- if( backend == BuiltInBackends::MAGMA_GPU ) return MemorySpace::cuda; -#else - LVARRAY_UNUSED_VARIABLE( backend ); -#endif - - return MemorySpace::host; + return Operation::NO_OP; } } // namespace dense diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 9c4fda87..376b589c 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -41,10 +41,14 @@ enum class SymmetricMatrixStorageType LOWER_TRIANGULAR, }; -/** - * TODO: move to internal namespace - */ -char const * getOption( SymmetricMatrixStorageType const option ); +enum class Operation +{ + NO_OP, + TRANSPOSE, + ADJOINT, +}; + +Operation transposeOp( Operation const op ); /** * @@ -64,84 +68,76 @@ static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value; template< typename T, typename U > static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value; -/** - * - */ -enum class BuiltInBackends -{ - LAPACK, -#if defined( LVARRAY_USE_MAGMA ) - MAGMA, - MAGMA_GPU, -#endif -}; - -/** - * - */ -MemorySpace getSpaceForBackend( BuiltInBackends const backend ); - -/** - * - */ -using DenseInt = int; - /** * */ template< typename T > struct Matrix { - /** - * - */ - template< typename INDEX_TYPE > - Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 0 ) ) }, - nCols{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, - isColumnMajor{ true }, - data{ slice.data() } - {} - - /** - * - */ - template< typename INDEX_TYPE, int USD > - Matrix( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - nCols{ integerConversion< DenseInt >( 1 ) }, - stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, - isColumnMajor{ true }, - data{ slice.data() } - {} + Matrix( + typeManipulation::CArray< std::ptrdiff_t, 2 > const & sizesIn, + typeManipulation::CArray< std::ptrdiff_t, 2 > const & stridesIn, + T * const dataIn ): + sizes{ sizesIn }, + strides{ stridesIn }, + data{ dataIn } + { + LVARRAY_ERROR_IF_LT( sizes[ 0 ], 0 ); + LVARRAY_ERROR_IF_LT( sizes[ 1 ], 0 ); + LVARRAY_ERROR_IF_LT( strides[ 0 ], 0 ); + LVARRAY_ERROR_IF_LT( strides[ 1 ], 0 ); + } - /** - * - */ Matrix( T & value ): - nRows{ 1 }, - nCols{ 1 }, - stride{ 1 }, - isColumnMajor{ true }, + sizes{ 1, 1 }, + strides{ 1, 1 }, data{ &value } {} - /** - * - */ + Matrix( Matrix< std::remove_const_t< T > > const & src ): + sizes{ src.sizes }, + strides{ src.strides }, + data{ src.data } + {} + bool isSquare() const + { return sizes[0] == sizes[1]; } + + bool isColumnMajor() const + { return strides[ 0 ] == 1; } + + bool isRowMajor() const + { return strides[ 1 ] == 1; } + + bool isContiguous() const + { return isColumnMajor() || isRowMajor(); } + + std::ptrdiff_t nRows() const + { return sizes[ 0 ]; } + + std::ptrdiff_t nCols() const + { return sizes[ 1 ]; } + + Matrix transpose() const { - return nRows == nCols; + return Matrix( { sizes[ 1 ], sizes[ 0 ] }, { strides[ 1 ], strides[ 0 ] }, data ); } - DenseInt const nRows; - DenseInt const nCols; - DenseInt const stride; - bool const isColumnMajor; - T * const data; + typeManipulation::CArray< std::ptrdiff_t, 2 > sizes; + typeManipulation::CArray< std::ptrdiff_t, 2 > strides; + T * data; }; +template< typename T, typename PERM, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +Matrix< T > toMatrix( + Array< T, 2, PERM, INDEX_TYPE, BUFFER_TYPE > const & array, + MemorySpace const 
space, + bool const touch ) +{ + array.move( space, touch ); + return Matrix< T >( array.dimsArray(), array.stridesArray(), array.data() ); +} + /** * */ @@ -150,8 +146,8 @@ struct Vector { template< int USD, typename INDEX_TYPE > Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - size{ integerConversion< DenseInt >( slice.size() ) }, - stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + size{ integerConversion< std::ptrdiff_t >( slice.size() ) }, + stride{ integerConversion< std::ptrdiff_t >( slice.stride( 0 ) ) }, data{ slice.data() } {} @@ -161,8 +157,8 @@ struct Vector data{ &value } {} - DenseInt const size; - DenseInt const stride; + std::ptrdiff_t const size; + std::ptrdiff_t const stride; T * const data; }; @@ -183,17 +179,17 @@ struct Workspace virtual Vector< RealVersion< T > > rwork() = 0; - virtual Vector< DenseInt > iwork() = 0; + virtual Vector< int > iwork() = 0; - virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; }; /** @@ -223,44 +219,44 @@ struct ArrayWorkspace : public Workspace< T > virtual Vector< RealVersion< T > > rwork() override { return m_rwork.toSlice(); } - virtual Vector< DenseInt > iwork() override + virtual Vector< int > iwork() override { return m_iwork.toSlice(); } - virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } private: - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work; - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work2; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, 
BUFFER_TYPE > m_work2; - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work3; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work3; - Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; + Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_rwork; - Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_iwork; }; /** @@ -287,28 +283,28 @@ struct OptimalSizeCalculation : public Workspace< T > virtual Vector< int > iwork() override { return m_iwork; } - virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." 
);
   }
 
-  DenseInt optimalWorkSize() const
-  { return static_cast< DenseInt >( m_work.real() ); }
+  std::ptrdiff_t optimalWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_work.real() ); }
 
-  DenseInt optimalRWorkSize() const
-  { return static_cast< DenseInt >( m_rwork ); }
+  std::ptrdiff_t optimalRWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_rwork ); }
 
-  DenseInt optimalIWorkSize() const
+  std::ptrdiff_t optimalIWorkSize() const
   { return m_iwork; }
 
 private:
@@ -320,7 +316,7 @@ struct OptimalSizeCalculation : public Workspace< T >
 
   RealVersion< T > m_rwork { -1 };
 
-  DenseInt m_iwork { -1 };
+  int m_iwork { -1 };
 };
 
 } // namespace dense
diff --git a/src/dense/dense.hpp b/src/dense/dense.hpp
new file mode 100644
index 00000000..2fcf202d
--- /dev/null
+++ b/src/dense/dense.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename INTERFACE, typename MATRIX_A, typename MATRIX_B, typename MATRIX_C, typename SCALAR >
+void gemm(
+  Operation opA,
+  Operation opB,
+  SCALAR const alpha,
+  MATRIX_A const & Ain,
+  MATRIX_B const & Bin,
+  SCALAR const beta,
+  MATRIX_C const & Cin )
+{
+  Matrix< SCALAR const > A = toMatrix( Ain, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR const > B = toMatrix( Bin, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR > const C = toMatrix( Cin, INTERFACE::MEMORY_SPACE, true );
+
+  // Check the sizes
+  LVARRAY_ERROR_IF_NE( C.sizes[ 0 ], A.sizes[ 0 + (opA != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( C.sizes[ 1 ], B.sizes[ 1 - (opB != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( A.sizes[ 1 - (opA != Operation::NO_OP) ],
+                       B.sizes[ 0 + (opB != Operation::NO_OP) ] );
+
+  // Check that everything is contiguous
+  LVARRAY_ERROR_IF( !A.isContiguous(), "Matrix A must have unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !B.isContiguous(), "Matrix B must have unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !C.isColumnMajor(), "Matrix C must be column major." );
+
+  // TODO(corbett5): Don't think this will work for Hermitian matrices.
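+  // A row-major matrix holds the same data as the transpose of a column-major one,
+  // so present it to the backend through transpose() and flip the requested
+  // operation with transposeOp(); op( A ) itself is unchanged.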
+ if( !A.isColumnMajor() ) + { + A = A.transpose(); + opA = transposeOp( opA ); + } + if( !B.isColumnMajor() ) + { + B = B.transpose(); + opB = transposeOp( opB ); + } + + INTERFACE::gemm( opA, opB, alpha, A, B, beta, C ); +} + + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index e70b6561..68236057 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -90,8 +90,8 @@ DenseInt heevr( char const * const JOBZ = decompositionOptions.typeArg(); char const * const RANGE = decompositionOptions.rangeArg(); char const * const UPLO = getOption( storageType ); - DenseInt const N = A.nCols; - DenseInt const LDA = A.stride; + DenseInt const N = A.sizes[ 1 ]; + DenseInt const LDA = A.strides[ 1 ]; T const VL = decompositionOptions.rangeMin; T const VU = decompositionOptions.rangeMax; @@ -112,11 +112,11 @@ DenseInt heevr( if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) { - LVARRAY_ERROR_IF_NE( eigenvectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenvectors.nCols, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_NE( eigenvectors.sizes[ 0 ], N ); + LVARRAY_ERROR_IF_LT( eigenvectors.sizes[ 1 ], maxEigenvaluesToFind ); } - DenseInt const LDZ = std::max( 1, eigenvectors.stride ); + DenseInt const LDZ = std::max( 1, eigenvectors.strides[ 1 ] ); if( decompositionOptions.range == EigenDecompositionOptions::ALL || ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX && @@ -348,12 +348,12 @@ DenseInt heevr( { // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. // I'm not sure exactly how this would work for the eigenvectors though. - LVARRAY_ERROR_IF( !A.isColumnMajor, "Row major is not yet supported." ); - LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor(), "Row major is not yet supported." ); - bool const reallocateWork = workspace.work().size < 2 * A.nRows; - bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; - bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows; + bool const reallocateWork = workspace.work().size < 2 * A.sizes[ 0 ]; + bool const reallocateRWork = workspace.rwork().size < 24 * A.sizes[ 0 ]; + bool const reallocateIWork = workspace.iwork().size < 10 * A.sizes[ 0 ]; if( reallocateWork || reallocateRWork || reallocateIWork ) { diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp index 9833710f..33d5f503 100644 --- a/src/dense/linearSolve.cpp +++ b/src/dense/linearSolve.cpp @@ -67,17 +67,17 @@ void gesv( Vector< DenseInt > const & pivots ) { LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); - LVARRAY_ERROR_IF( !A.isColumnMajor, "The matrix A must be column major." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "The matrix A must be column major." ); - LVARRAY_ERROR_IF_NE( A.nRows, B.nRows ); - LVARRAY_ERROR_IF( !B.isColumnMajor, "The matrix B must be column major." ); + LVARRAY_ERROR_IF_NE( A.sizes[ 0 ], B.sizes[ 0 ] ); + LVARRAY_ERROR_IF( !B.isColumnMajor(), "The matrix B must be column major." 
);
 
-  LVARRAY_ERROR_IF_NE( pivots.size, A.nRows );
+  LVARRAY_ERROR_IF_NE( pivots.size, A.sizes[ 0 ] );
 
-  DenseInt const N = A.nCols;
-  DenseInt const NRHS = B.nCols;
-  DenseInt const LDA = A.stride;
-  DenseInt const LDB = B.stride;
+  DenseInt const N = A.sizes[ 1 ];
+  DenseInt const NRHS = B.sizes[ 1 ];
+  DenseInt const LDA = A.strides[ 1 ];
+  DenseInt const LDB = B.strides[ 1 ];
   DenseInt INFO = 0;
 
   if( backend == BuiltInBackends::LAPACK )
diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt
index 4d58286d..f87b2fda 100644
--- a/unitTests/dense/CMakeLists.txt
+++ b/unitTests/dense/CMakeLists.txt
@@ -9,9 +9,8 @@
 # Specify list of tests
 #
 set( testSources
-     testEigenDecomposition.cpp
-     testLinearSolve.cpp
-   )
+  testgemm.cpp
+)
 
 #
 # Add gtest C++ based tests
diff --git a/unitTests/dense/testgemm.cpp b/unitTests/dense/testgemm.cpp
new file mode 100644
index 00000000..51f50773
--- /dev/null
+++ b/unitTests/dense/testgemm.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/dense.hpp"
+#include "dense/BlasLapackInterface.hpp"
+
+#include "../testUtils.hpp"
+
+#include <limits>
+#include <random>
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+// This should probably go in a common place
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >;
+
+template< typename T >
+std::enable_if_t< std::is_floating_point< T >::value, T >
+randomValue( std::mt19937 & gen )
+{ return std::uniform_real_distribution< T >{ -1, 1 }( gen ); }
+
+template< typename T >
+std::enable_if_t< dense::IsComplex< T >, T >
+randomValue( std::mt19937 & gen )
+{
+  return { std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ),
+           std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ) };
+}
+
+template< typename T, typename PERM >
+Array2d< T, PERM > randomMatrix( std::ptrdiff_t const N, std::ptrdiff_t const M )
+{
+  std::mt19937 gen( std::random_device{}() );
+
+  Array2d< T, PERM > const ret( N, M );
+
+  for( std::ptrdiff_t r = 0; r < N; ++r )
+  {
+    for( std::ptrdiff_t c = 0; c < M; ++c )
+    {
+      ret( r, c ) = T{10} * randomValue< T >( gen );
+    }
+  }
+
+  return ret;
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< std::is_floating_point< T >::value >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< dense::IsComplex< T > >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_COMPLEX_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename INTERFACE, typename T, typename PERM_A, typename PERM_B >
+struct GemmTest
+{
+  void Rij_eq_AikBkj()
+  {
+    std::mt19937 gen( std::random_device{}() );
+
+    int const N = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    int const M = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    int const K = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+
+    T const alpha = T{10} * randomValue< T >( gen );
+    T const beta = T{10} * randomValue< T >( gen );
+
+    Array2d< T, PERM_A > const A = randomMatrix< T, PERM_A >( N, K );
+    Array2d< T, PERM_B > const B = randomMatrix< T, PERM_B >( K, M );
+    Array2d< T, RAJA::PERM_JI > const C = randomMatrix< T, RAJA::PERM_JI >( N, M );
+
+    Array2d< T, PERM_A > const Acopy = A;
+    Array2d< T, PERM_B > const Bcopy = B;
+    Array2d< T, RAJA::PERM_JI > const Ccopy = C;
+
+    dense::gemm< INTERFACE >( dense::Operation::NO_OP, dense::Operation::NO_OP, alpha, A, B, beta, C );
+
+    A.move( MemorySpace::host, false );
+    B.move( MemorySpace::host, false );
+    C.move( MemorySpace::host, false );
+
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < K; ++k )
+        {
+          dot += Acopy( i, k ) * Bcopy( k, j );
+        }
+
+        Ccopy( i, j ) = alpha * dot + beta * Ccopy( i, j );
+      }
+    }
+
+    checkEqual( A, Acopy, 0 );
+    checkEqual( B, Bcopy, 0 );
+    checkEqual( C, Ccopy, 1e3 * std::numeric_limits< dense::RealVersion< T > >::epsilon() );
+  }
+};
+
+TEST( LapackInterface_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
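
For reference, a minimal end-to-end sketch of the interface added in this series,
assuming the headers land as above; the MallocBuffer storage and the fill loops are
illustrative only, and the call dispatches to dgemm through BlasLapackInterface< double >:

#include "Array.hpp"
#include "MallocBuffer.hpp"
#include "dense/dense.hpp"
#include "dense/BlasLapackInterface.hpp"

int main()
{
  using namespace LvArray;

  // Column-major (RAJA::PERM_JI) arrays are what the LAPACK backend accepts directly;
  // a row-major A or B would be transpose-normalized inside dense::gemm.
  using ColMajor = Array< double, 2, RAJA::PERM_JI, std::ptrdiff_t, MallocBuffer >;

  ColMajor A( 3, 4 ), B( 4, 2 ), C( 3, 2 );
  for( std::ptrdiff_t i = 0; i < 3; ++i )
    for( std::ptrdiff_t j = 0; j < 4; ++j )
      A( i, j ) = 1.0 / ( i + j + 1 );

  for( std::ptrdiff_t i = 0; i < 4; ++i )
    for( std::ptrdiff_t j = 0; j < 2; ++j )
      B( i, j ) = ( i == j );

  // C = 1.0 * A * B + 0.0 * C; with beta == 0 the initial contents of C are ignored.
  dense::gemm< dense::BlasLapackInterface< double > >(
    dense::Operation::NO_OP, dense::Operation::NO_OP, 1.0, A, B, 0.0, C );

  return 0;
}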