From f81806357445a1d7751dd67ada044e1fd49fb498 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 8 May 2023 23:07:38 -0700 Subject: [PATCH 01/34] Squash --- .../toss_3_x86_64_ib/compilers.yaml | 65 +++---------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml index 3d9648a7..a4af3f37 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml @@ -1,11 +1,11 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@14.0.6 paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native @@ -15,12 +15,12 @@ compilers: environment: {} extra_rpaths: [] - compiler: - spec: clang@11.0.1 + spec: gcc@12.1.1 paths: - cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/g++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native @@ -29,48 +29,3 @@ compilers: modules: [] environment: {} extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.1.2 - paths: - cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native - cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] From 9ed565c0ea525894bfc71a47207dde1ce35478ad Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 26 Jan 2023 10:58:05 -0800 Subject: [PATCH 02/34] Fixed cmake so that GEOSX TPLs work and also updated spack. 
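
The functional core of this patch is in cmake/SetupTPL.cmake: when CAMP_DIR
and RAJA_DIR resolve to the same prefix, LvArray now reuses the CAMP bundled
inside the RAJA install instead of requiring a separate find_package(camp).
A host-config would opt in roughly as in the sketch below (illustrative only;
GEOSX_TPL_DIR is the variable already used by the LLNL host-configs, but the
"raja" subdirectory name here is a placeholder, not taken from this patch):

    set(RAJA_DIR "${GEOSX_TPL_DIR}/raja" CACHE PATH "")
    # Setting CAMP_DIR equal to RAJA_DIR takes the new
    # "LvArray using CAMP from RAJA." branch and skips find_package(camp).
    set(CAMP_DIR "${RAJA_DIR}" CACHE PATH "")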
--- cmake/SetupTPL.cmake | 51 +++---- host-configs/LLNL/lassen-base.cmake | 3 - host-configs/LLNL/quartz-base.cmake | 3 - scripts/uberenv/packages/lvarray/package.py | 26 ++-- scripts/uberenv/project.json | 4 +- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 132 +++++++++++++++--- .../toss_3_x86_64_ib_python/packages.yaml | 7 + 7 files changed, 161 insertions(+), 65 deletions(-) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index bff94834..29a60128 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,19 +1,22 @@ set(thirdPartyLibs "") -################################ +############################### # CAMP -################################ -if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") -endif() +############################### +if(CAMP_DIR STREQUAL RAJA_DIR) + message(STATUS "LvArray using CAMP from RAJA.") +else() + if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") + endif() -message(STATUS "Using CAMP from ${CAMP_DIR}") + message(STATUS "LvArray using CAMP from ${CAMP_DIR}") -find_package(camp REQUIRED PATHS ${CAMP_DIR}) + find_package(camp REQUIRED PATHS ${CAMP_DIR}) -set(ENABLE_CAMP ON CACHE BOOL "") + set(thirdPartyLibs ${thirdPartyLibs} camp) +endif() -set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -22,7 +25,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "Using RAJA from ${RAJA_DIR}") +message(STATUS "LvArray using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -39,14 +42,14 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "Using Umpire from ${UMPIRE_DIR}") + message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "Not using Umpire.") + message(STATUS "LvArray not using Umpire.") endif() ################################ @@ -65,32 +68,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "Using CHAI from ${CHAI_DIR}") + message(STATUS "LvArray using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - set_target_properties(chai - PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. 
+ # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + # set_target_properties(chai + # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "Not using CHAI.") + message(STATUS "LvArray not using CHAI.") endif() -################################ +############################### # CALIPER -################################ +############################### if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "Using caliper from ${CALIPER_DIR}") + message(STATUS "LvArray using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -102,7 +105,7 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "Not using caliper.") + message(STATUS "LvArray not using caliper.") endif() ################################ diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..31a8f048 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -11,9 +11,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - set(ENABLE_ADDR2LINE ON CACHE BOOL "") # Cuda options diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake index b7eb21df..ef4128a6 100644 --- a/host-configs/LLNL/quartz-base.cmake +++ b/host-configs/LLNL/quartz-base.cmake @@ -12,9 +12,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - # set(ENABLE_PYLVARRAY ON CACHE BOOL "") # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "") diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 9c4b47d9..cf9d5548 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,32 +56,36 @@ class Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') - + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') - depends_on('camp+cuda', when='+cuda') depends_on('raja') - depends_on('raja+cuda', when='+cuda') - # At the moment Umpire doesn't support shared when building with CUDA. 
depends_on('umpire', when='+umpire') - depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') - depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') - depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') + depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') + with when('+cuda'): + for sm_ in CudaPackage.cuda_arch_values: + depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + + phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -285,10 +289,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n\n".format("-" * 80)) if "+caliper" in spec: - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) else: @@ -297,6 +297,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write('#{0}\n'.format('-' * 80)) cfg.write('# Python\n') cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) @@ -306,6 +307,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: cfg.write(cmake_cache_option("ENABLE_DOCS", True)) sphinx_dir = spec['py-sphinx'].prefix diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 9822f975..703db7a4 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/corbett5/spack", - "spack_branch": "package/corbett/lvarray-update", + "spack_url": "https://github.com/spack/spack", + "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b1bf26cb..b8353dd0 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -2,30 +2,90 @@ compilers: - compiler: spec: clang@10.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ f77: 
/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: spec: clang@11.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 + paths: + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 modules: [] environment: {} extra_rpaths: [] @@ -37,25 +97,55 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: 
{} + extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: xl@16.1.1 + spec: intel@19.1.2 paths: - cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc - cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC - f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf - fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort flags: - cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 - cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native + cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 0c6b833b..a6fbda09 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,3 +107,10 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ + + ninja: + buildable: False + externals: + - spec: ninja@kitware + prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ + From 329d7f1fd8b79439adeaf7bf6e632b1f27be3bd2 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 8 May 2023 22:53:05 -0700 Subject: [PATCH 03/34] Squash --- Notes.txt | 1 + ...quartz-toss_3_x86_64_ib-clang@10.0.1.cmake | 93 +++++++++++++++++++ .../toss_3_x86_64_ib_python/packages.yaml | 2 +- 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 Notes.txt create mode 100644 new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake diff --git a/Notes.txt b/Notes.txt new file mode 100644 index 00000000..d07775c5 --- /dev/null +++ b/Notes.txt @@ -0,0 +1 @@ +./scripts/uberenv/uberenv.py --prefix=../uberenv-libs/ --spack-config-dir=./scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/ --spec="%clang@10.0.1 +umpire +chai +caliper +pylvarray +benchmarks +examples ^caliper ~adiak ~mpi ~libunwind ~libdw ~papi" \ No newline at end of file diff --git a/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake new file mode 100644 index 00000000..90ac014b --- /dev/null 
+++ b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake @@ -0,0 +1,93 @@ +################################################################################# +# Generated host-config - Edit at own risk! +################################################################################# +#-------------------------------------------------------------------------------- +# SYS_TYPE: toss_3_x86_64_ib +# Compiler Spec: clang@10.0.1 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#-------------------------------------------------------------------------------- + +set(BLT_SOURCE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/blt-0.5.2-6nztad6saell6ikor6wtxp6qycxtfwh4" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Compilers +#-------------------------------------------------------------------------------- + +set(CMAKE_C_COMPILER "/usr/tce/bin/clang-10.0.1" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/bin/clang++-10.0.1" CACHE PATH "") + +set(CMAKE_C_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "") + +set(CMAKE_CXX_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(ENABLE_CUDA OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# CAMP +#-------------------------------------------------------------------------------- + +set(CAMP_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/camp-2022.03.2-2q75xbq2h4ykcyvasoqg55torawlabkw" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# RAJA +#-------------------------------------------------------------------------------- + +set(RAJA_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/raja-2022.03.0-jkp4hp7ifyxkxzkbho5ngdnk4x3opaoy" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Umpire +#-------------------------------------------------------------------------------- + +set(ENABLE_UMPIRE ON CACHE BOOL "") + +set(UMPIRE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/umpire-2022.03.1-aerit7injc3hmn2ripnsxtnlwxicjmuu" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# CHAI +#-------------------------------------------------------------------------------- + +set(ENABLE_CHAI ON CACHE BOOL "") + +set(CHAI_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/chai-2022.03.0-s6w2gsrreu7krgzboekmlukmfestpg7k" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Caliper +#-------------------------------------------------------------------------------- + +#-------------------------------------------------------------------------------- +# Caliper +#-------------------------------------------------------------------------------- + +set(ENABLE_CALIPER ON CACHE BOOL "") + +set(CALIPER_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/caliper-2.8.0-3fwkrbu4bhnc4bqvhrqcydrzxslq6ryz" CACHE PATH "") + 
+#-------------------------------------------------------------------------------- +# Python +#-------------------------------------------------------------------------------- + +set(ENABLE_PYLVARRAY OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Documentation +#-------------------------------------------------------------------------------- + +set(ENABLE_DOCS OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# addr2line +#-------------------------------------------------------------------------------- + +set(ENABLE_ADDR2LINE ON CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Other +#-------------------------------------------------------------------------------- + diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index a6fbda09..5b3c9fbe 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -112,5 +112,5 @@ packages: buildable: False externals: - spec: ninja@kitware - prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ + prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ninja From d31c0ffdcdc83d875336d50550c546f1e37d12be Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 9 May 2023 00:54:59 -0700 Subject: [PATCH 04/34] Got Spack working on Quartz again, need to do Lassen and figure out python stuff. --- host-configs/LLNL/lassen-base.cmake | 3 +++ host-configs/LLNL/quartz-base.cmake | 3 +++ scripts/uberenv/packages/lvarray/package.py | 13 +++++++++++-- .../toss_3_x86_64_ib_python/packages.yaml | 4 ++-- .../compilers.yaml | 4 ++-- .../packages.yaml | 0 6 files changed, 21 insertions(+), 6 deletions(-) rename scripts/uberenv/spack_configs/{toss_3_x86_64_ib => toss_4_x86_64_ib}/compilers.yaml (94%) rename scripts/uberenv/spack_configs/{toss_3_x86_64_ib => toss_4_x86_64_ib}/packages.yaml (100%) diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 31a8f048..5a443bb9 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -11,6 +11,9 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") +set(ENABLE_CALIPER ON CACHE BOOL "") +set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") + set(ENABLE_ADDR2LINE ON CACHE BOOL "") # Cuda options diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake index ef4128a6..b7eb21df 100644 --- a/host-configs/LLNL/quartz-base.cmake +++ b/host-configs/LLNL/quartz-base.cmake @@ -12,6 +12,9 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") +set(ENABLE_CALIPER ON CACHE BOOL "") +set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") + # set(ENABLE_PYLVARRAY ON CACHE BOOL "") # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "") diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index cf9d5548..b377bdfa 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,6 +56,8 @@ class 
Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') + variant('tpl_build_type', default='none', description='TPL build type', + values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) depends_on('blt', when='@0.2.0:', type='build') @@ -82,9 +84,16 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='+chai cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='+caliper cuda_arch={0}'.format(sm_)) + for bt in ('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel'): + with when('tpl_build_type={}'.format(bt)): + depends_on('camp build_type={}'.format(bt)) + depends_on('raja build_type={}'.format(bt)) + depends_on('umpire build_type={}'.format(bt)) + depends_on('chai build_type={}'.format(bt), when='+chai') + depends_on('caliper build_type={}'.format(bt), when='+caliper') phases = ['hostconfig', 'cmake', 'build', 'install'] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 5b3c9fbe..43971e78 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -111,6 +111,6 @@ packages: ninja: buildable: False externals: - - spec: ninja@kitware - prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ninja + - spec: ninja@1.11.0 + prefix: /usr/tce/packages/ninja/ninja-1.11.0 diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml similarity index 94% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml index a4af3f37..15bdbccd 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml @@ -9,7 +9,7 @@ compilers: flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native - operating_system: rhel7 + operating_system: rhel8 target: x86_64 modules: [] environment: {} @@ -24,7 +24,7 @@ compilers: flags: cflags: -march=native -mtune=native cxxflags: -march=native -mtune=native - operating_system: rhel7 + operating_system: rhel8 target: x86_64 modules: [] environment: {} diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml similarity index 100% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml From e09e98707edb6e6f6387fc29e68f630d2680c511 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 22 Feb 2022 18:15:46 -0500 Subject: [PATCH 05/34] small umpire and raja versioning and rocm changes --- CMakeLists.txt | 3 ++ cmake/blt | 2 +- .../{ => 
ORNL}/ascent-gcc@8.1.1.cmake | 0 host-configs/ORNL/crusher-cce@13.0.1.cmake | 41 +++++++++++++++++++ host-configs/ORNL/spock-cce@12.0.3.cmake | 39 ++++++++++++++++++ src/CMakeLists.txt | 4 +- src/bufferManipulation.hpp | 15 +++++++ unitTests/testUtils.hpp | 1 + 8 files changed, 102 insertions(+), 3 deletions(-) rename host-configs/{ => ORNL}/ascent-gcc@8.1.1.cmake (100%) create mode 100644 host-configs/ORNL/crusher-cce@13.0.1.cmake create mode 100644 host-configs/ORNL/spock-cce@12.0.3.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index fc8cf73d..e53d193d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ if( NOT is_submodule ) option( ENABLE_ADDR2LINE "Enable addr2line usage in stacktraces" ON ) option( ENABLE_CUDA "Build with CUDA" OFF ) + option( ENABLE_HIP "Build with HIP" OFF ) option( ENABLE_UMPIRE "Build with UMPIRE" OFF ) option( ENABLE_CHAI "Build with CHAI" OFF ) option( ENABLE_CALIPER "Build with Caliper" OFF ) @@ -80,6 +81,8 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) +blt_list_append( TO lvarray_dependencies ELEMENTS hip hip_runtime IF ENABLE_HIP ) + blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/cmake/blt b/cmake/blt index c253509a..ddd5a0ca 160000 --- a/cmake/blt +++ b/cmake/blt @@ -1 +1 @@ -Subproject commit c253509ab2daf759eb857958597f6f34ab8c1713 +Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb diff --git a/host-configs/ascent-gcc@8.1.1.cmake b/host-configs/ORNL/ascent-gcc@8.1.1.cmake similarity index 100% rename from host-configs/ascent-gcc@8.1.1.cmake rename to host-configs/ORNL/ascent-gcc@8.1.1.cmake diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake new file mode 100644 index 00000000..2a359fd5 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -0,0 +1,41 @@ +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) + +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) + +# C++ options +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) +set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) +set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) +set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) +#set( 
CMAKE_CXX_FLAGS "-std=c++14 -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) + +set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) +set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) +set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") diff --git a/host-configs/ORNL/spock-cce@12.0.3.cmake b/host-configs/ORNL/spock-cce@12.0.3.cmake new file mode 100644 index 00000000..f0764c32 --- /dev/null +++ b/host-configs/ORNL/spock-cce@12.0.3.cmake @@ -0,0 +1,39 @@ +set(CONFIG_NAME "spock-cce@12.0.3" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen2/cce-12.0.3" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-qpmhf6p7n5sarmks55hgjnzff3ncs7jd/" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-frggdmwjevbxy4a6kw7ctgrhyv7erfhr/" CACHE PATH "" ) + +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-nkdetdg5tjyzzf5yjzo32jxwkmwfjjqn/" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-wun25mr5qf7vo6x2vblhzh2ivs7vr4g6/" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-a5ponjo23u7smy7w4a4jj7im47shrsxk/" CACHE PATH "" ) + +set(METIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/metis-5.1.0-rbblqiymq6eoursordyaq2ghimzpd22v/" CACHE PATH "" ) +set(PARMETIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/parmetis-4.0.3-mliemgo6vxrahsz4f6u5agdqyfpk2yd2/" CACHE PATH "" ) + +# C++ options +#set(CMAKE_C_COMPILER "/opt/cray/pe/cce/12.0.3/bin/craycc" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayCC" CACHE PATH "") +#set(CMAKE_Fortran_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayftn" CACHE PATH "") + +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.11/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.11/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.11/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI OFF CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( HIP_ROOT "/opt/rocm-4.2.0" CACHE PATH "" ) +set( HIP_VERSION_STRING "4.2.0" CACHE STRING "" ) +set( CMAKE_HIP_ARCHITECTURES "gfx908" CACHE STRING "" FORCE ) + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 03f627c2..da7c512f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,7 +38,7 @@ set( lvarray_headers sortedArrayManipulationHelpers.hpp system.hpp tensorOps.hpp - totalview/tv_data_display.h +# totalview/tv_data_display.h typeManipulation.hpp umpireInterface.hpp ) @@ -46,7 +46,7 @@ blt_list_append( TO lvarray_headers ELEMENTS ChaiBuffer.hpp IF ENABLE_CHAI ) set( lvarray_sources system.cpp - totalview/tv_data_display.c +# totalview/tv_data_display.c umpireInterface.cpp ) blt_add_library( NAME lvarray diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 62b94539..b06a4e4c 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -69,6 +69,21 @@ namespace bufferManipulation */ HAS_MEMBER_FUNCTION_NO_RTYPE( move, 
MemorySpace::host, true ); + +template < typename T > +struct ContainerShim +{ + ContainerShim( T * begin, T * end ) + : m_begin( begin ) + , m_end( end ) + {} + T * begin() const { return m_begin; } + T * end() const { return m_end; } + T * m_begin; + T * m_end; +}; + + /** * @class VoidBuffer * @brief This class implements the default behavior for the Buffer methods related diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 5a2db2bf..161c8453 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -20,6 +20,7 @@ // TPL includes #include +#include #include // System includes From f5a81fee1c4302c53c9821ea8fba4c9c663de083 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Mon, 21 Mar 2022 16:04:07 -0400 Subject: [PATCH 06/34] ongoing crusher/rocm work --- cmake/Config.cmake | 1 + host-configs/ORNL/crusher-cce@13.0.1.cmake | 21 +- src/ArrayOfArraysView.hpp | 3 + src/ChaiBuffer.hpp | 18 +- src/LvArrayConfig.hpp.in | 2 + src/Macros.hpp | 6 +- src/SortedArrayView.hpp | 3 + unitTests/testArray1DOfArray1D.cpp | 2 +- unitTests/testArray1DOfArray1DOfArray1D.cpp | 2 +- unitTests/testArrayOfArrays.cpp | 4 +- unitTests/testArrayOfSets.cpp | 2 +- unitTests/testArray_ChaiBuffer.cpp | 42 +- unitTests/testCRSMatrix.cpp | 4 +- unitTests/testChaiBuffer.cpp | 132 ++- unitTests/testMath.cpp | 10 +- unitTests/testMemcpy.cpp | 101 ++- unitTests/testSortedArray.cpp | 2 +- unitTests/testSortedArrayManipulation.cpp | 4 +- unitTests/testSparsityPattern.cpp | 4 +- unitTests/testStackArray.cpp | 2 +- unitTests/testTensorOpsEigen.cpp | 2 +- unitTests/testTensorOpsFixedSize.cpp | 2 +- unitTests/testTensorOpsInverse.hpp | 4 +- unitTests/testTensorOpsNoSize.cpp | 2 +- unitTests/testTensorOpsOneSize.cpp | 2 +- unitTests/testTensorOpsThreeSizes.hpp | 2 +- unitTests/testTensorOpsTwoSizes.hpp | 2 +- unitTests/testTensorOpsTwoSizes1.cpp | 930 +++++++++++++++++++- unitTests/testTypeManipulation.cpp | 17 + unitTests/testUtils.hpp | 23 +- 30 files changed, 1305 insertions(+), 46 deletions(-) diff --git a/cmake/Config.cmake b/cmake/Config.cmake index 0a44fd1b..cf8ff35b 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,6 +2,7 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA + HIP TOTALVIEW_OUTPUT CALIPER ) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 2a359fd5..65830097 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -7,8 +7,10 @@ set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(ENABLE_CHAI TRUE CACHE BOOL "" ) set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) @@ -28,14 +30,27 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) set( ENABLE_HIP ON CACHE BOOL "" FORCE ) set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) -#set( CMAKE_CXX_FLAGS 
"-std=c++14 -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) + +#set( CMAKE_CXX_FLAGS "--offload-arch=gfx90a -x hip -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) -set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) +set( HIP_HIPCC_FLAGS "-std=c++14 --amdgpu-target=gfx90a" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) + +set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") set(gtest_disable_pthreads ON CACHE BOOL "") + +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +#set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +#BLT +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 706f2014..52c8df15 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -587,6 +587,9 @@ class ArrayOfArraysView #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_offsets.move( space, touch ); } diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index b5d26fa1..6b0d45ec 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -56,7 +56,11 @@ inline chai::ExecutionSpace toChaiExecutionSpace( MemorySpace const space ) if( space == MemorySpace::host ) return chai::CPU; #if defined(LVARRAY_USE_CUDA) - if( space == MemorySpace::cuda || space == MemorySpace::hip ) + if( space == MemorySpace::cuda ) + return chai::GPU; +#endif +#if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) return chai::GPU; #endif @@ -79,6 +83,10 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::GPU ) return MemorySpace::cuda; #endif +#if defined(LVARRAY_USE_HIP) + if( space == chai::GPU ) + return MemorySpace::hip; +#endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -185,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -203,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); @@ -370,7 +378,7 @@ class ChaiBuffer inline void moveNested( MemorySpace const space, std::ptrdiff_t const size, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if 
defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || @@ -398,7 +406,7 @@ class ChaiBuffer */ void move( MemorySpace const space, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index 2c997ab5..dcbd30b3 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -26,6 +26,8 @@ #cmakedefine LVARRAY_USE_CUDA +#cmakedefine LVARRAY_USE_HIP + #cmakedefine LVARRAY_USE_TOTALVIEW_OUTPUT #cmakedefine LVARRAY_USE_CALIPER diff --git a/src/Macros.hpp b/src/Macros.hpp index 544f5e19..a2060c1a 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,7 +22,7 @@ #include #include -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) #include #endif @@ -91,7 +91,7 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -535,7 +535,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) +#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIPCC__) ) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ diff --git a/src/SortedArrayView.hpp b/src/SortedArrayView.hpp index ab7ca790..8559a3fc 100644 --- a/src/SortedArrayView.hpp +++ b/src/SortedArrayView.hpp @@ -274,6 +274,9 @@ class SortedArrayView { #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_values.move( space, touch ); } diff --git a/unitTests/testArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1D.cpp index faa53b52..7ff271cb 100644 --- a/unitTests/testArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1D.cpp @@ -233,7 +233,7 @@ using Array1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp index 5dc93fe8..cdd17fe2 100644 --- a/unitTests/testArray1DOfArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp @@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 
> > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArrayOfArrays.cpp b/unitTests/testArrayOfArrays.cpp index 784fd448..aa20086b 100644 --- a/unitTests/testArrayOfArrays.cpp +++ b/unitTests/testArrayOfArrays.cpp @@ -1284,7 +1284,7 @@ using ArrayOfArraysViewTestTypes = ::testing::Types< , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1467,7 +1467,7 @@ using ArrayOfArraysViewAtomicTestTypes = ::testing::Types< , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArrayOfSets.cpp b/unitTests/testArrayOfSets.cpp index d3b9f540..ac71a76b 100644 --- a/unitTests/testArrayOfSets.cpp +++ b/unitTests/testArrayOfSets.cpp @@ -925,7 +925,7 @@ using ArrayOfSetsViewTestTypes = ::testing::Types< , std::pair< ArrayOfSets< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< ArrayOfSets< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp index 34825981..5ef2a6a1 100644 --- a/unitTests/testArray_ChaiBuffer.cpp +++ b/unitTests/testArray_ChaiBuffer.cpp @@ -42,6 +42,10 @@ class ArrayTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined(LVARRAY_USE_HIP) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -61,13 +65,19 @@ class ArrayTest : public ::testing::Test array.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" ); + array.move( MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + array.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" ); + array.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( array.data() 
).getName(), "HOST_pool" ); #endif } #if defined( LVARRAY_USE_CUDA ) - void testDeviceAlloc() + void testCudaDeviceAlloc() { Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array; @@ -86,6 +96,26 @@ class ArrayTest : public ::testing::Test } } #endif +#if defined(LVARRAY_USE_HIP) + void testHIPDeviceAlloc() + { + Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array; + + array.resizeWithoutInitializationOrDestruction( MemorySpace::hip, 100 ); + + T * const devPtr = array.data(); + forall< parallelDevicePolicy< 32 > >( array.size(), [devPtr] LVARRAY_DEVICE ( int const i ) + { + new ( &devPtr[ i ] ) T( i ); + } ); + + array.move( MemorySpace::host, true ); + for( int i = 0; i < array.size(); ++i ) + { + EXPECT_EQ( array[ i ], T( i ) ); + } + } +#endif }; /// The list of types to instantiate ArrayTest with. @@ -104,9 +134,17 @@ TYPED_TEST( ArrayTest, AllocatorConstruction ) TYPED_TEST( ArrayTest, DeviceAlloc ) { - this->testDeviceAlloc(); + this->testCudaDeviceAlloc(); } +#endif +#if defined(LVARRAY_USE_HIP) + +TYPED_TEST( ArrayTest, DeviceAlloc ) +{ + this->testHIPDeviceAlloc(); +} + #endif } // namespace testing diff --git a/unitTests/testCRSMatrix.cpp b/unitTests/testCRSMatrix.cpp index 987aa4e9..3c6c0556 100644 --- a/unitTests/testCRSMatrix.cpp +++ b/unitTests/testCRSMatrix.cpp @@ -1036,7 +1036,7 @@ using CRSMatrixViewTestTypes = ::testing::Types< , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1276,7 +1276,7 @@ using CRSMatrixViewAtomicTestTypes = ::testing::Types< , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index 8c6d9937..ae12886f 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -41,6 +41,10 @@ class ChaiBufferTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined( LVARRAY_USE_HIP ) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -62,6 +66,12 @@ class ChaiBufferTest : public ::testing::Test buffer.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( 
MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); #endif @@ -188,6 +198,126 @@ class ChaiBufferTest : public ::testing::Test EXPECT_EQ( buffer[ i ], T( i ) ); } + bufferManipulation::free( buffer, size ); + } +#elif defined( LVARRAY_USE_HIP ) + void testMove() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + T * const devPtr = buffer.data(); + + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + // Check that the device changes are seen on the host. Then modify the values without touching. + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) ); + buffer[ i ] = T( 0 ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } + + bufferManipulation::free( buffer, size ); + } + + void testCapture() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + + // Check that the device changes are seen on the host. Then modify the values without touching. 
+    ChaiBuffer< T const > constBuffer( buffer );
+    forall< serialPolicy >( size, [constBuffer] ( int const i )
+    {
+      EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) );
+      const_cast< T & >( constBuffer[ i ] ) = T( 0 );
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host );
+    EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host );
+
+    forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i )
+    {
+      buffer[ i ] += buffer[ i ];
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip );
+
+    forall< serialPolicy >( size, [constBuffer] ( int const i )
+    {
+      EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) );
+    } );
+
+    EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host );
+    EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host );
+
+    bufferManipulation::free( buffer, size );
+  }
+
+  void testDeviceRealloc()
+  {
+    ChaiBuffer< T > buffer( true );
+
+    int const size = 100;
+    buffer.reallocate( 0, MemorySpace::hip, size );
+
+    T * const devPtr = buffer.data();
+    forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i )
+    {
+      new ( &devPtr[ i ] ) T( i );
+    } );
+
+    buffer.move( MemorySpace::host, true );
+    for( int i = 0; i < size; ++i )
+    {
+      EXPECT_EQ( buffer[ i ], T( i ) );
+    }
+
+    bufferManipulation::free( buffer, size );
+  }
 #endif
@@ -205,7 +335,7 @@ TYPED_TEST( ChaiBufferTest, AllocatorConstruction )
   this->testAllocatorConstruction();
 }
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 TYPED_TEST( ChaiBufferTest, Move )
 {
diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp
index 08502c4f..d7c76b19 100644
--- a/unitTests/testMath.cpp
+++ b/unitTests/testMath.cpp
@@ -145,14 +145,16 @@ using TestMathTypes = ::testing::Types<
   , std::pair< long long int, serialPolicy >
   , std::pair< float, serialPolicy >
   , std::pair< double, serialPolicy >
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
   , std::pair< int, parallelDevicePolicy< 32 > >
   , std::pair< long int, parallelDevicePolicy< 32 > >
   , std::pair< long long int, parallelDevicePolicy< 32 > >
   , std::pair< float, parallelDevicePolicy< 32 > >
   , std::pair< double, parallelDevicePolicy< 32 > >
-  , std::pair< __half, parallelDevicePolicy< 32 > >
 #endif
+#if defined( LVARRAY_USE_CUDA )
+  , std::pair< __half, parallelDevicePolicy< 32 > >
+#endif
   >;
 
 TYPED_TEST_SUITE( TestMath, TestMathTypes, );
@@ -331,7 +332,7 @@ struct TestMath2 : public ::testing::Test
   }
 };
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 using TestMath2Types = ::testing::Types<
   std::pair< __half2, parallelDevicePolicy< 32 > >
@@ -403,7 +404,8 @@ void forAllHalvesinMinus1to1( bool const include1, LAMBDA && lambda )
     }
   } );
 }
-
+#endif
+#if defined(LVARRAY_USE_CUDA)
 void asinHalfAccuracy()
 {
   RAJA::ReduceMax< RAJA::cuda_reduce, double > maxDiff( 0 );
diff --git a/unitTests/testMemcpy.cpp b/unitTests/testMemcpy.cpp
index f3adcece..0e44243d 100644
--- a/unitTests/testMemcpy.cpp
+++ b/unitTests/testMemcpy.cpp
@@ -242,7 +242,106 @@ void testAsyncMemcpyDevice()
     EXPECT_EQ( x[ i ], -i );
   }
 }
+#elif defined(LVARRAY_USE_HIP)
+template< template< typename > class BUFFER_TYPE >
+void testMemcpyDevice()
+{
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = 
y.data(); + + memcpy< 0, 0 >( y, {}, x.toViewConst(), {} ); + + forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + PORTABLE_EXPECT_EQ( yPtr[ i ], i ); + yPtr[ i ] *= 2; + } ); + + memcpy< 0, 0 >( x, {}, y.toViewConst(), {} ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], 2 * i ); + } + + // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing + // to host memory but the subsequent memcpy should pick up that it's previous space is on device. + y.move( MemorySpace::host ); + + ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView(); + forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + yView[ i ] = -i; + } ); + + memcpy< 0, 0 >( x, {}, y.toViewConst(), {} ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], -i ); + } +} + +template< template< typename > class BUFFER_TYPE > +void testAsyncMemcpyDevice() +{ + camp::resources::Resource stream{ camp::resources::Hip{} }; + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + x[ i ] = i; + } + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() ); + y.move( MemorySpace::hip ); + int * yPtr = y.data(); + + camp::resources::Event e = memcpy< 0, 0 >( stream, y.toView(), {}, x.toViewConst(), {} ); + stream.wait_for( &e ); + + forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i ) + { + PORTABLE_EXPECT_EQ( yPtr[ i ], i ); + yPtr[ i ] *= 2; + } ); + + e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} ); + stream.wait_for( &e ); + + for( std::ptrdiff_t i = 0; i < x.size(); ++i ) + { + EXPECT_EQ( x[ i ], 2 * i ); + } + + // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing + // to host memory but the subsequent memcpy should pick up that it's previous space is on device. 
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
 #endif
 
 TEST( TestMemcpy, MallocBuffer1D )
 {
@@ -282,7 +381,7 @@ TEST( TestMemcpy, ChaiBuffer2D )
   testMemcpy2D< ChaiBuffer >();
 }
 
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 
 TEST( TestMemcpy, ChaiBufferDevice )
 {
diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp
index 5198bd24..ae145fbd 100644
--- a/unitTests/testSortedArray.cpp
+++ b/unitTests/testSortedArray.cpp
@@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types<
   std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
   , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
   , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI)
   , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
   , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testSortedArrayManipulation.cpp b/unitTests/testSortedArrayManipulation.cpp
index 2d784cb2..ae376cb4 100644
--- a/unitTests/testSortedArrayManipulation.cpp
+++ b/unitTests/testSortedArrayManipulation.cpp
@@ -190,7 +190,7 @@ using SingleArrayTestTypes = ::testing::Types<
   , std::tuple< TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
   , std::tuple< TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
   , std::tuple< int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
   , std::tuple< Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
@@ -290,7 +290,7 @@ using DualArrayTestTypes = ::testing::Types<
   , std::tuple< TestString, TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
   , std::tuple< TestString, TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< int, int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
   , std::tuple< int, int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
   , std::tuple< Tensor, Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
diff --git a/unitTests/testSparsityPattern.cpp b/unitTests/testSparsityPattern.cpp
index 50ec30f9..fee7a995 100644
--- a/unitTests/testSparsityPattern.cpp
+++ b/unitTests/testSparsityPattern.cpp
@@ -1016,7 +1016,7 @@ using SparsityPatternViewTestTypes = ::testing::Types<
 #endif
 #endif
 
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::pair< 
SparsityPattern< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #if !defined( __ibmxl__ ) , std::pair< SparsityPattern< uint, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > @@ -1171,7 +1171,7 @@ using CRSMatrixTestTypes = ::testing::Types< std::pair< CRSMatrix< int, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testStackArray.cpp b/unitTests/testStackArray.cpp index 249ccebb..e29206ab 100644 --- a/unitTests/testStackArray.cpp +++ b/unitTests/testStackArray.cpp @@ -281,7 +281,7 @@ using StackArrayCaptureTestTypes = ::testing::Types< , std::pair< RAJA::PERM_KIJ, serialPolicy > , std::pair< RAJA::PERM_KJI, serialPolicy > -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) , std::pair< RAJA::PERM_I, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_IJ, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_JI, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsEigen.cpp b/unitTests/testTensorOpsEigen.cpp index 46ff354d..2c556ec7 100644 --- a/unitTests/testTensorOpsEigen.cpp +++ b/unitTests/testTensorOpsEigen.cpp @@ -243,7 +243,7 @@ using TestEigendecompositionTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< std::int64_t, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< std::int64_t, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index e66fd5a3..c4ba14cb 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -569,7 +569,7 @@ using FixedSizeSquareMatrixTestTypes = ::testing::Types< std::tuple< double, std::integral_constant< int, 2 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 4909a686..9edfa950 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -375,7 +375,7 @@ using InverseTestTypes = ::testing::Types< , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< int, double, std::integral_constant< int, 3 
>, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > @@ -400,7 +400,7 @@ using InverseFloatOnlyTestTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsNoSize.cpp b/unitTests/testTensorOpsNoSize.cpp index b08e5ae1..8c1112d4 100644 --- a/unitTests/testTensorOpsNoSize.cpp +++ b/unitTests/testTensorOpsNoSize.cpp @@ -349,7 +349,7 @@ using NoSizeTestTypes = ::testing::Types< std::tuple< double, serialPolicy > , std::tuple< int, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, parallelDevicePolicy< 32 > > , std::tuple< int, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsOneSize.cpp b/unitTests/testTensorOpsOneSize.cpp index fc351c75..78946638 100644 --- a/unitTests/testTensorOpsOneSize.cpp +++ b/unitTests/testTensorOpsOneSize.cpp @@ -693,7 +693,7 @@ using OneSizeTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 6 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 6 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsThreeSizes.hpp b/unitTests/testTensorOpsThreeSizes.hpp index 5a27092a..b4546a9b 100644 --- a/unitTests/testTensorOpsThreeSizes.hpp +++ b/unitTests/testTensorOpsThreeSizes.hpp @@ -530,7 +530,7 @@ using ThreeSizesTestTypes = ::testing::Types< std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, diff --git a/unitTests/testTensorOpsTwoSizes.hpp b/unitTests/testTensorOpsTwoSizes.hpp index 07978011..5492b2b5 100644 --- a/unitTests/testTensorOpsTwoSizes.hpp +++ b/unitTests/testTensorOpsTwoSizes.hpp @@ -930,7 +930,7 @@ using TwoSizesTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy > -#if 
defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
   , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
   , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
   , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp
index 7f5a97d5..101c4671 100644
--- a/unitTests/testTensorOpsTwoSizes1.cpp
+++ b/unitTests/testTensorOpsTwoSizes1.cpp
@@ -6,13 +6,941 @@
  */
 
 // Source includes
-#include "testTensorOpsTwoSizes.hpp"
+//#include "testTensorOpsTwoSizes.hpp"
+
+// Source includes
+#include "tensorOps.hpp"
+#include "Array.hpp"
+#include "testUtils.hpp"
+#include "output.hpp"
+#include "testTensorOpsCommon.hpp"
+
+// TPL includes
+#include <gtest/gtest.h>
 
 namespace LvArray
 {
 namespace testing
 {
 
+template< typename T_N_M_POLICY_TUPLE >
+class TwoSizesTest : public ::testing::Test
+{
+public:
+  using T = std::tuple_element_t< 0, T_N_M_POLICY_TUPLE >;
+  static constexpr std::ptrdiff_t N = std::tuple_element_t< 1, T_N_M_POLICY_TUPLE > {};
+  static constexpr std::ptrdiff_t M = std::tuple_element_t< 2, T_N_M_POLICY_TUPLE > {};
+  using POLICY = std::tuple_element_t< 3, T_N_M_POLICY_TUPLE >;
+
+  void SetUp() override
+  {
+    fill( m_matrixA_IJK.toSlice(), m_matrixASeed );
+    fill( m_matrixA_IKJ.toSlice(), m_matrixASeed );
+    fill( m_matrixA_KJI.toSlice(), m_matrixASeed );
+    fill( m_matrixA_local, m_matrixASeed );
+
+    fill( m_matrixB_IJK.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_IKJ.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_KJI.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_local, m_matrixBSeed );
+
+    fill( m_matrixNN_IJK.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_IKJ.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_KJI.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_local, m_matrixNNSeed );
+
+    fill( m_matrixMN_IJK.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_IKJ.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_KJI.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_local, m_matrixMNSeed );
+
+    fill( m_vectorN_IJ.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_JI.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_local, m_vectorNSeed );
+
+    fill( m_vectorM_IJ.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_JI.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_local, m_vectorMSeed );
+  }
+
+  void testScale()
+  {
+    T scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] * scale;
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    std::ptrdiff_t const aSeed = m_matrixASeed;
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, aSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      tensorOps::scale< N, M >( matrixA_IJK[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IJK[ 0 ], result );
+
+      tensorOps::scale< N, M >( matrixA_IKJ[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IKJ[ 0 ], result );
+
+      tensorOps::scale< N, M >( matrixA_KJI[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_KJI[ 0 ], result );
+
+      T matrix_local[ N ][ M ];
+      fill( matrix_local, aSeed );
+      
tensorOps::scale< N, M >( matrix_local, scale ); + CHECK_EQUALITY_2D( N, M, matrix_local, result ); + } ); + } + + void testFill() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI] LVARRAY_HOST_DEVICE ( int ) + { + for( int i = 0; i < 3; ++i ) + { + T const value = 3.14 * i; + tensorOps::fill< N, M >( matrixA_IJK[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IJK( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_IKJ[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IKJ( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_KJI[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_KJI( 0, j, k ), value ); + } + } + + T matrix_local[ N ][ M ]; + tensorOps::fill< N, M >( matrix_local, value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrix_local[ j ][ k ], value ); + } + } + } + } ); + } + + void testAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, + vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_eq_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], 
vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_add_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } 
+ result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_eq_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = m_vectorN_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, 
matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_add_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_eq_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorM_local[ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + 
_TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = m_vectorM_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_add_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorM_local[ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], 
vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testCopy() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::copy< N, M >( dstMatrix, srcMatrix ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, srcMatrix ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testScaledCopy() + { + T scale = T( 3.14 ); + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = scale * m_matrixB_local[ i ][ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = 
m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::scaledCopy< N, M >( dstMatrix, srcMatrix, scale ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAdd() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_matrixB_local[ i ][ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst(); + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::add< N, M >( dstMatrix, srcMatrix ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ) + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, 
srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testScaledAdd() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toView(); + ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toView(); + ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toView(); + + T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed ] + LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::scaledAdd< N, M >( dstMatrix, srcMatrix, scale ); \ + CHECK_EQUALITY_2D( N, M, dstMatrix, result ); \ + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrixA_local[ N ][ M ]; + fill( matrixA_local, matrixSeed ); + + T const scale = T( 3.14 ); + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = matrixA_local[ i ][ j ] + scale * matrixB_local[ i ][ j ]; + } + } + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], 
matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAkiAkj() + { + T result[ N ][ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + T dot = 0; + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + dot += m_matrixMN_local[ k ][ i ] * m_matrixMN_local[ k ][ j ]; + } + result[ i ][ j ] = dot; + } + } + + ArrayViewT< T const, 3, 2 > const matrixMN_IJK = m_matrixMN_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixMN_IKJ = m_matrixMN_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixMN_KJI = m_matrixMN_KJI.toViewConst(); + T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local; + + ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView(); + + std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed; + + forall< POLICY >( 1, + [result, matrixMN_IJK, matrixMN_IKJ, matrixMN_KJI, matrixMN_local, matrixNN_IJK, + matrixNN_IKJ, matrixNN_KJI, matrixNNSeed ] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrixNN, matrixMN ) \ + fill( matrixNN, matrixNNSeed ); \ + tensorOps::Rij_eq_AkiAkj< N, M >( matrixNN, matrixMN ); \ + CHECK_EQUALITY_2D( N, N, matrixNN, result ) + + #define _TEST_PERMS( matrixNN, matrixMN0, matrixMN1, matrixMN2, matrixMN3 ) \ + _TEST( matrixNN, matrixMN0 ); \ + _TEST( matrixNN, matrixMN1 ); \ + _TEST( matrixNN, matrixMN2 ); \ + _TEST( matrixNN, matrixMN3 ) + + T matrixNN_local[ N ][ N ]; + + _TEST_PERMS( matrixNN_IJK[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_KJI[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixNN_local, matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAikAjk() + { + T result[ N ][ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + T dot = 0; + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + dot += m_matrixA_local[ i ][ k ] * m_matrixA_local[ j ][ k ]; + } + result[ i ][ j ] = m_matrixNN_local[ i ][ j ] + dot; + } + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + 
ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrixA_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView(); + + std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixA_local, matrixNN_IJK, + matrixNN_IKJ, matrixNN_KJI, matrixNNSeed ] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrixNN, matrixA ) \ + fill( matrixNN, matrixNNSeed ); \ + tensorOps::Rij_add_AikAjk< N, M >( matrixNN, matrixA ); \ + CHECK_EQUALITY_2D( N, N, matrixNN, result ) + + #define _TEST_PERMS( matrixNN, matrixA0, matrixA1, matrixA2, matrixA3 ) \ + _TEST( matrixNN, matrixA0 ); \ + _TEST( matrixNN, matrixA1 ); \ + _TEST( matrixNN, matrixA2 ); \ + _TEST( matrixNN, matrixA3 ) + + T matrixNN_local[ N ][ N ]; + + _TEST_PERMS( matrixNN_IJK[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_KJI[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + _TEST_PERMS( matrixNN_local, matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testTranspose() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 3, 2 > const matrixMN_IJK_view = m_matrixMN_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixMN_IKJ_view = m_matrixMN_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixMN_KJI_view = m_matrixMN_KJI.toViewConst(); + T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [=] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( dstMatrix, srcMatrix ) \ + fill( dstMatrix, matrixSeed ); \ + tensorOps::transpose< N, M >( dstMatrix, srcMatrix ); \ + for( int i = 0; i < N; ++i ) \ + { \ + for( int j = 0; j < M; ++j ) \ + { \ + PORTABLE_EXPECT_EQ( dstMatrix[ i ][ j ], srcMatrix[ j ][ i ] ); \ + } \ + } + + #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \ + _TEST( dstMatrix, srcMatrix0 ); \ + _TEST( dstMatrix, srcMatrix1 ); \ + _TEST( dstMatrix, srcMatrix2 ); \ + _TEST( dstMatrix, srcMatrix3 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local ); + 
_TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+private:
+  std::ptrdiff_t const m_matrixASeed = 0;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixA_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixA_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixA_KJI { 1, N, M };
+  T m_matrixA_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixBSeed = m_matrixASeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixB_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixB_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixB_KJI { 1, N, M };
+  T m_matrixB_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixNNSeed = m_matrixBSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixNN_IJK { 1, N, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixNN_IKJ { 1, N, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixNN_KJI { 1, N, N };
+  T m_matrixNN_local[ N ][ N ];
+
+  std::ptrdiff_t const m_matrixMNSeed = m_matrixNNSeed + N * N;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixMN_IJK { 1, M, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixMN_IKJ { 1, M, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixMN_KJI { 1, M, N };
+  T m_matrixMN_local[ M ][ N ];
+
+  std::ptrdiff_t const m_vectorNSeed = m_matrixMNSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorN_IJ { 1, N };
+  ArrayT< T, RAJA::PERM_JI > m_vectorN_JI { 1, N };
+  T m_vectorN_local[ N ];
+
+  std::ptrdiff_t const m_vectorMSeed = m_vectorNSeed + N;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorM_IJ { 1, M };
+  ArrayT< T, RAJA::PERM_JI > m_vectorM_JI { 1, M };
+  T m_vectorM_local[ M ];
+};
+
+
+using TwoSizesTestTypes = ::testing::Types<
+  std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, serialPolicy >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
+
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
+  , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+#endif
+  >;
+
+TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, );
+
+
 TYPED_TEST( TwoSizesTest, scale )
 {
   this->testScale();
diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp
index 5bef9a4c..45ad8380 100644
--- a/unitTests/testTypeManipulation.cpp
+++ b/unitTests/testTypeManipulation.cpp
@@ -78,6 +78,23 @@ CUDA_TEST( typeManipulation, forEachArg )
   }, intReducer, floatReducer, doubleReducer );
   } );
 
+  EXPECT_EQ( intReducer.get(), 2 );
+  EXPECT_EQ( floatReducer.get(), 4 );
+  EXPECT_EQ( doubleReducer.get(), 7 );
+#elif defined(LVARRAY_USE_HIP)
+  // Test on device.
+ RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 ); + RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 ); + RAJA::ReduceSum< RAJA::hip_reduce, double > doubleReducer( 6 ); + forall< parallelDevicePolicy< 32 > >( 1, [intReducer, floatReducer, doubleReducer] LVARRAY_DEVICE ( int ) + { + // This has to be a host-device lambda to avoid errors. + typeManipulation::forEachArg( [] LVARRAY_HOST_DEVICE ( auto & reducer ) + { + reducer += 1; + }, intReducer, floatReducer, doubleReducer ); + } ); + EXPECT_EQ( intReducer.get(), 2 ); EXPECT_EQ( floatReducer.get(), 4 ); EXPECT_EQ( doubleReducer.get(), 7 ); diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 161c8453..639b20ad 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -73,6 +73,19 @@ struct RAJAHelper< RAJA::cuda_exec< N > > static constexpr MemorySpace space = MemorySpace::cuda; }; +#elif defined(LVARRAY_USE_HIP) + +template< unsigned long THREADS_PER_BLOCK > +using parallelDevicePolicy = RAJA::hip_exec< THREADS_PER_BLOCK >; + +template< unsigned long N > +struct RAJAHelper< RAJA::hip_exec< N > > +{ + using ReducePolicy = RAJA::hip_reduce; + using AtomicPolicy = RAJA::hip_atomic; + static constexpr MemorySpace space = MemorySpace::hip; +}; + #endif template< typename POLICY, typename INDEX_TYPE, typename LAMBDA > @@ -104,14 +117,14 @@ LAYOUT const & getRAJAViewLayout( RAJA::View< T, LAYOUT > const & view ) } -#ifndef __CUDA_ARCH__ -#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R ) -#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \ - STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ); -#else +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define PORTABLE_EXPECT_EQ( L, R ) LVARRAY_ERROR_IF_NE( L, R ) #define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) LVARRAY_ERROR_IF_GE_MSG( math::abs( ( L ) -( R ) ), EPSILON, \ STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ) ); +#else +#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R ) +#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \ + STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ); #endif // Comparator that compares a std::pair by it's first object. 
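The reordered PORTABLE_EXPECT_EQ / PORTABLE_EXPECT_NEAR block above keys the device branch off __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__, so a host-device lambda asserts through LVARRAY_ERROR_IF_NE when compiled for either GPU back end and falls back to the ordinary gtest macros on the host. A minimal sketch of the intended usage, using a hypothetical checkIota helper built only on the forall helper and policy aliases defined in this header (illustrative, not part of the patch):

template< typename POLICY >
void checkIota()
{
  forall< POLICY >( 16, [] LVARRAY_HOST_DEVICE ( int const i )
  {
    // Expands to EXPECT_EQ on the host; in device code it expands to
    // LVARRAY_ERROR_IF_NE, which aborts the current kernel on failure.
    PORTABLE_EXPECT_EQ( ( i + 1 ) - 1, i );
  } );
}

checkIota< serialPolicy >();               // reports through gtest
checkIota< parallelDevicePolicy< 32 > >(); // RAJA::hip_exec< 32 > under LVARRAY_USE_HIP

Because RAJAHelper is now specialized for RAJA::hip_exec, the same test bodies run unchanged on serial, CUDA, and HIP builds.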
From 0e2996f424a4c8ad06807f13dc569b2d9a1d48c9 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 25 Mar 2022 14:43:35 -0400 Subject: [PATCH 07/34] hip changes, crusher tpl installs --- CMakeLists.txt | 2 +- host-configs/ORNL/crusher-cce@13.0.1.cmake | 29 ++++++++++++++-------- src/ChaiBuffer.hpp | 11 ++++++-- src/Macros.hpp | 26 ++++++++++++++++--- src/system.cpp | 7 +++++- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e53d193d..f682d16b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) -blt_list_append( TO lvarray_dependencies ELEMENTS hip hip_runtime IF ENABLE_HIP ) +blt_list_append( TO lvarray_dependencies ELEMENTS blt::hip IF ENABLE_HIP ) blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 65830097..39684b93 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -4,14 +4,18 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") -set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) -set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +#set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) +#set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) +set(CAMP_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-aeczo5gctizktwwt5x7xlmuyoarwipag/" CACHE PATH "" ) -set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-xpf6nnrxjhhggterbeto5ugxdgftpmon" CACHE PATH "" ) + set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-yubforuougga3ujwwpfz3tmybqhroczp/" CACHE PATH "" ) +set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-develop-6yofhoaebc3bnz5wbzqnweeknbpomgrt" CACHE PATH "" ) + +#set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) +set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-develop-6rh55pqg6dxvconxa52itkvdnptm3mfl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -28,19 +32,24 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) # HIP Options set( ENABLE_HIP ON CACHE BOOL "" FORCE ) +set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) -#set( CMAKE_CXX_FLAGS 
"--offload-arch=gfx90a -x hip -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE ) +# set( CMAKE_CXX_FLAGS "-D__HIP_PLATFORM_AMD__ -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1 --rocm-path=${HIP_ROOT} -x hip" CACHE STRING "" FORCE ) + +# set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) +# set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) -set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -set( HIP_HIPCC_FLAGS "-std=c++14 --amdgpu-target=gfx90a" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) +# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +# set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) -set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link -set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) +set(CMAKE_CXX_FLAGS "-D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE) +set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index 6b0d45ec..e82bd499 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -78,14 +78,20 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::NONE ) return MemorySpace::undefined; if( space == chai::CPU ) + { + std::cout << "toHost" << std::endl; return MemorySpace::host; + } #if defined(LVARRAY_USE_CUDA) if( space == chai::GPU ) return MemorySpace::cuda; #endif #if defined(LVARRAY_USE_HIP) if( space == chai::GPU ) + { + std::cout << "toHIPGPU" << std::endl; return MemorySpace::hip; + } #endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -149,6 +155,7 @@ class ChaiBuffer for( int space = chai::CPU; space < chai::NUM_EXECUTION_SPACES; ++space ) { + // std::cout << space << std::endl m_pointerRecord->m_allocators[ space ] = internal::getArrayManager().getAllocatorId( chai::ExecutionSpace( space ) ); } } @@ -193,7 +200,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -211,7 +218,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_ARCH__) ) + #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); diff --git a/src/Macros.hpp b/src/Macros.hpp index a2060c1a..e5549bb6 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,7 +22,20 @@ #include #include -#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) +#if defined(LVARRAY_USE_CUDA) + 
#define LVARRAY_GPU_LANG CUDA +#elif defined(LVARRAY_USE_HIP) + #define LVARRAY_GPU_LANG HIP +#endif + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + #define LVARRAY_ON_DEVICE 1 +#else + #define LVARRAY_ON_DEVICE 0 +#endif + + +#if defined(LVARRAY_GPU_LANG) #include #endif @@ -91,7 +104,7 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#if defined(__CUDA_ARCH__) #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -118,6 +131,7 @@ } \ } while( false ) #endif +//#elif defined(__HIP_DEVICE_COMPILE__) #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -535,7 +549,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIPCC__) ) +#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIP_DEVICE_COMPILE__) ) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ @@ -549,7 +563,11 @@ * call host only code. This is safe as long as the host only instantiations are only called on * the host. To use place directly above a the template. */ -#define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#if defined(LVARRAY_USE_CUDA) + #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#else + #define DISABLE_HD_WARNING +#endif #else /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE diff --git a/src/system.cpp b/src/system.cpp index 25a2ec13..a6532ac5 100644 --- a/src/system.cpp +++ b/src/system.cpp @@ -417,11 +417,16 @@ std::string calculateSize( size_t const bytes ) suffix = "MB"; shift = 20; } - else + else if( bytes >> 10 != 0 ) { suffix = "KB"; shift = 10; } + else + { + suffix = "B"; + shift = 0; + } double const units = double( bytes ) / ( 1 << shift ); From ada2118dce68bc47eb3916663478f39b7863b4ea Mon Sep 17 00:00:00 2001 From: William Tobin Date: Mon, 4 Apr 2022 13:59:22 -0400 Subject: [PATCH 08/34] ongoing hip work and debugging --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 21 +++--- src/Array.hpp | 10 +-- src/ArrayOfArraysView.hpp | 4 ++ src/ArraySlice.hpp | 4 +- src/ArrayView.hpp | 3 + src/CRSMatrix.hpp | 2 +- src/CRSMatrixView.hpp | 3 +- src/ChaiBuffer.hpp | 11 +-- src/Macros.hpp | 64 +++++++++-------- src/arrayManipulation.hpp | 3 +- src/bufferManipulation.hpp | 2 +- src/math.hpp | 82 +++++++++++++--------- src/sortedArrayManipulation.hpp | 2 +- unitTests/testTensorOpsFixedSize.cpp | 1 - 14 files changed, 114 insertions(+), 98 deletions(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 39684b93..e12e2ec6 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,21 +1,19 @@ + set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") -#set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-3zz2mkf2wvglevvl4ozepe4tzhwtchoa/" CACHE PATH "" ) -#set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-3qpdz6h2dzvfm5t7uabpz2ykiheza5b4/" CACHE PATH "" ) set(CAMP_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE 
PATH "" ) +set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) + set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-xpf6nnrxjhhggterbeto5ugxdgftpmon" CACHE PATH "" ) +set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-develop-6yofhoaebc3bnz5wbzqnweeknbpomgrt" CACHE PATH "" ) - -#set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-twro7k3cfsmp7s6mkiugsqncivj6w327/" CACHE PATH "" ) -set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-develop-6rh55pqg6dxvconxa52itkvdnptm3mfl" CACHE PATH "" ) +set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -45,10 +43,13 @@ set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) # set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) # set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) -# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fpgu-rdc --hip-link +# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link # set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) -set(CMAKE_CXX_FLAGS "-D__HIP_ARCH_GFX90A__=1" CACHE STRING "" FORCE) +# BLT WTF +#set(CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +#set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) + set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options @@ -56,7 +57,7 @@ set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") set(gtest_disable_pthreads ON CACHE BOOL "") set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -#set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) diff --git a/src/Array.hpp b/src/Array.hpp index d05769cd..28ef6f95 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,10 +91,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -121,10 +121,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if 
defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -588,7 +588,7 @@ class Array : public ArrayView< T, void setName( std::string const & name ) { this->m_dataBuffer.template setName< decltype(*this) >( name ); } -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 52c8df15..e3aeb72a 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -202,12 +202,14 @@ class ArrayOfArraysView * @brief A constructor to create an uninitialized ArrayOfArraysView. * @note An uninitialized ArrayOfArraysView should not be used until it is assigned to. */ + LVARRAY_HOST_DEVICE ArrayOfArraysView() = default; /** * @brief Default copy constructor. * @note The copy constructor will trigger the copy constructor for @tparam BUFFER_TYPE */ + LVARRAY_HOST_DEVICE ArrayOfArraysView( ArrayOfArraysView const & ) = default; /** @@ -244,6 +246,7 @@ class ArrayOfArraysView * @brief Default copy assignment operator. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView const & ) = default; @@ -252,6 +255,7 @@ class ArrayOfArraysView * @param src the SparsityPatternView to be moved from. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView && src ) { diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp index b4e22345..374979e3 100644 --- a/src/ArraySlice.hpp +++ b/src/ArraySlice.hpp @@ -126,7 +126,7 @@ class ArraySlice m_dims( inputDimensions ), m_strides( inputStrides ) { -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK) ArraySlice::TV_ttf_display_type( nullptr ); #endif } @@ -341,7 +341,7 @@ class ArraySlice ///@} -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index aabd48bf..799ac461 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -118,6 +118,7 @@ class ArrayView * @brief A constructor to create an uninitialized ArrayView. * @note An uninitialized ArrayView should not be used until it is assigned to. */ + LVARRAY_HOST_DEVICE ArrayView() = default; /** @@ -185,6 +186,7 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ + //LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -206,6 +208,7 @@ class ArrayView {} /// The default destructor. 
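+ // Decorated like the defaulted constructors above so the defaulted
+ // destructor is usable in device code; hip/CCE is presumably stricter
+ // than nvcc about implicitly host-only special members in kernels.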
+ LVARRAY_HOST_DEVICE ~ArrayView() = default; /** diff --git a/src/CRSMatrix.hpp b/src/CRSMatrix.hpp index ddd786c5..daffdd9e 100644 --- a/src/CRSMatrix.hpp +++ b/src/CRSMatrix.hpp @@ -139,7 +139,7 @@ class CRSMatrix : protected CRSMatrixView< T, COL_TYPE, INDEX_TYPE, BUFFER_TYPE RAJA::forall< POLICY >( RAJA::TypedRangeSegment< INDEX_TYPE >( 0, numRows() ), [view] LVARRAY_HOST_DEVICE ( INDEX_TYPE const row ) { - INDEX_TYPE const nnz = view.numNonZeros( row ); + INDEX_TYPE const nnz = view.numNonZeros( row ); T * const entries = view.getEntries( row ); arrayManipulation::destroy( entries, nnz ); } ); diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp index bc954672..fe3c7c99 100644 --- a/src/CRSMatrixView.hpp +++ b/src/CRSMatrixView.hpp @@ -106,12 +106,13 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE /** * @brief Default copy constructor. */ + LVARRAY_HOST_DEVICE CRSMatrixView( CRSMatrixView const & ) = default; /** * @brief Default move constructor. */ - inline + LVARRAY_HOST_DEVICE inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index e82bd499..f78998a8 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -78,20 +78,14 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::NONE ) return MemorySpace::undefined; if( space == chai::CPU ) - { - std::cout << "toHost" << std::endl; return MemorySpace::host; - } #if defined(LVARRAY_USE_CUDA) if( space == chai::GPU ) return MemorySpace::cuda; #endif #if defined(LVARRAY_USE_HIP) if( space == chai::GPU ) - { - std::cout << "toHIPGPU" << std::endl; return MemorySpace::hip; - } #endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -155,7 +149,6 @@ class ChaiBuffer for( int space = chai::CPU; space < chai::NUM_EXECUTION_SPACES; ++space ) { - // std::cout << space << std::endl m_pointerRecord->m_allocators[ space ] = internal::getArrayManager().getAllocatorId( chai::ExecutionSpace( space ) ); } } @@ -200,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -218,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if ( defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) ) || ( defined(LVARRAY_USE_HIP) && !defined(__HIP_DEVICE_COMPILE__) ) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); diff --git a/src/Macros.hpp b/src/Macros.hpp index e5549bb6..d4ff7562 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -12,6 +12,8 @@ #pragma once +#pragma clang diagnostic ignored "-Wfloat-equal" + // Source includes #include "LvArrayConfig.hpp" #include "system.hpp" @@ -22,22 +24,23 @@ #include #include -#if defined(LVARRAY_USE_CUDA) - #define LVARRAY_GPU_LANG CUDA -#elif defined(LVARRAY_USE_HIP) - #define LVARRAY_GPU_LANG HIP + +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) + #define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - #define LVARRAY_ON_DEVICE 1 -#else - #define 
LVARRAY_ON_DEVICE 0 + #define LVARRAY_DEVICE_COMPILE #endif +#if defined(__CUDACC__) || defined(__HIPCC__) + #define LVARRAY_DECORATE +#endif -#if defined(LVARRAY_GPU_LANG) + +//#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) #include -#endif +//#endif /** * @brief Convert @p A into a string. @@ -51,6 +54,8 @@ */ #define STRINGIZE( A ) STRINGIZE_NX( A ) +//#pragma message "LVARRAY_DEVICE_COMPILE: " STRINGIZE(LVARRAY_DEVICE_COMPILE) + /** * @brief Mark @p X as an unused argument, used to silence compiler warnings. * @param X the unused argument. @@ -104,8 +109,8 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) - #if !defined(NDEBUG) +#if defined(LVARRAY_DEVICE_COMPILE) +// #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -114,24 +119,23 @@ assert( false && "EXP = " STRINGIZE( EXP ) "MSG = " STRINGIZE( MSG ) ); \ } \ } while( false ) - #else -#define LVARRAY_ERROR_IF( EXP, MSG ) \ - do \ - { \ - if( EXP ) \ - { \ - constexpr char const * formatString = "***** ERROR\n" \ - "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ - printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ - asm ( "trap;" ); \ - } \ - } while( false ) - #endif -//#elif defined(__HIP_DEVICE_COMPILE__) +// #else +// #define LVARRAY_ERROR_IF( EXP, MSG ) \ +// do \ +// { \ +// if( EXP ) \ +// { \ +// constexpr char const * formatString = "***** ERROR\n" \ +// "***** LOCATION: " LOCATION "\n" \ +// "***** Block: [%u, %u, %u]\n" \ +// "***** Thread: [%u, %u, %u]\n" \ +// "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ +// "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ +// printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ +// asm ( "trap;" ); \ +// } \ +// } while( false ) +// #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ @@ -549,7 +553,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if ( defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) ) || ( defined(LVARRAY_USE_HIP) && defined(__HIP_DEVICE_COMPILE__) ) +#if defined(LVARRAY_DECORATE) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 5409e60f..4b5c2d55 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -297,8 +297,7 @@ void resize( T * const LVARRAY_RESTRICT ptr, { if( newSize - size > 0 ) { - std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); - std::memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( newSize - size ) * sizeof( T ) ); } } else diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index b06a4e4c..548cfe2b 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -292,7 +292,7 @@ void resize( BUFFER & buf, std::ptrdiff_t const size, std::ptrdiff_t const newSi arrayManipulation::resize( buf.data(), size, newSize, std::forward< ARGS >( args )... 
); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) if( newSize > 0 ) { buf.registerTouch( MemorySpace::host ); diff --git a/src/math.hpp b/src/math.hpp index f832e0fa..3bf2d9fd 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -134,7 +134,7 @@ __half2 convert( __half2 const, U const u ) LVARRAY_HOST_DEVICE inline __half2 convert( __half2 const, __half const u ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __half2half2( u ); #else return __float2half2_rn( u ); @@ -164,7 +164,7 @@ __half2 convert( __half2 const, U const u, V const v ) LVARRAY_HOST_DEVICE inline __half2 convert( __half2 const, __half const u, __half const v ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __halves2half2( u, v ); #else return __floats2half2_rn( u, v ); @@ -310,7 +310,7 @@ LVARRAY_HOST_DEVICE inline constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::max( a, b ); #else return std::max( a, b ); @@ -323,8 +323,10 @@ max( T const a, T const b ) LVARRAY_DEVICE inline __half max( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt( a, b ) ? a : b; #else return a > b ? a : b; #endif @@ -334,8 +336,10 @@ __half max( __half const a, __half const b ) LVARRAY_DEVICE inline __half2 max( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - return __hmax2( a, b ); +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + return __hmax2( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt2( a, b ) ? a : b; #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -357,7 +361,7 @@ LVARRAY_HOST_DEVICE inline constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::min( a, b ); #else return std::min( a, b ); @@ -370,8 +374,10 @@ min( T const a, T const b ) LVARRAY_DEVICE inline __half min( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hlt( a, b ) ? a : b; #else return a < b ? a : b; #endif @@ -381,8 +387,10 @@ __half min( __half const a, __half const b ) LVARRAY_DEVICE inline __half2 min( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin2( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hlt2( a, b ) ? 
a : b; #else __half2 const aFactor = __hle2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -401,7 +409,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline constexpr T abs( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::abs( x ); #else return std::abs( x ); @@ -460,7 +468,7 @@ T square( T const x ) LVARRAY_HOST_DEVICE inline float sqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrtf( x ); #else return std::sqrt( x ); @@ -472,7 +480,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double sqrt( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrt( double( x ) ); #else return std::sqrt( x ); @@ -502,7 +510,7 @@ __half2 sqrt( __half2 const x ) LVARRAY_HOST_DEVICE inline float invSqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::rsqrtf( x ); #else return 1 / std::sqrt( x ); @@ -514,7 +522,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double invSqrt( T const x ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return ::rsqrt( double( x ) ); #else return 1 / std::sqrt( x ); @@ -551,7 +559,7 @@ __half2 invSqrt( __half2 const x ) LVARRAY_HOST_DEVICE inline float sin( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sinf( theta ); #else return std::sin( theta ); @@ -563,7 +571,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double sin( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sin( double( theta ) ); #else return std::sin( theta ); @@ -593,7 +601,7 @@ __half2 sin( __half2 const theta ) LVARRAY_HOST_DEVICE inline float cos( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cosf( theta ); #else return std::cos( theta ); @@ -605,7 +613,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double cos( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cos( double( theta ) ); #else return std::cos( theta ); @@ -635,8 +643,12 @@ __half2 cos( __half2 const theta ) LVARRAY_HOST_DEVICE inline void sincos( float const theta, float & sinTheta, float & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + #if defined(LVARRAY_USE_CUDA) + ::sincos( theta, &sinTheta, &cosTheta ); + #elif defined(LVARRAY_USE_HIP) + ::sincosf( theta, &sinTheta, &cosTheta ); + #endif #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -648,8 +660,8 @@ template< typename T > LVARRAY_HOST_DEVICE inline void sincos( double const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + ::sincos( theta, &sinTheta, &cosTheta ); // hip and cuda versions both use double #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -661,7 +673,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline void sincos( T const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) double s, c; ::sincos( theta, &s, &c ); sinTheta = s; @@ -701,7 +713,7 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) LVARRAY_HOST_DEVICE inline float tan( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tanf( theta ); 
#else return std::tan( theta ); @@ -713,7 +725,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double tan( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tan( double( theta ) ); #else return std::tan( theta ); @@ -845,7 +857,7 @@ T atan2Impl( T const y, T const x ) LVARRAY_HOST_DEVICE inline float asin( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asinf( x ); #else return std::asin( x ); @@ -857,7 +869,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double asin( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asin( double( x ) ); #else return std::asin( x ); @@ -887,7 +899,7 @@ __half2 asin( __half2 const x ) LVARRAY_HOST_DEVICE inline float acos( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acosf( x ); #else return std::acos( x ); @@ -899,7 +911,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double acos( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acos( double( x ) ); #else return std::acos( x ); @@ -930,7 +942,7 @@ __half2 acos( __half2 const x ) LVARRAY_HOST_DEVICE inline float atan2( float const y, float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2f( y, x ); #else return std::atan2( y, x ); @@ -942,7 +954,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double atan2( T const y, T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2( double( y ), double( x ) ); #else return std::atan2( y, x ); @@ -979,7 +991,7 @@ __half2 atan2( __half2 const y, __half2 const x ) LVARRAY_HOST_DEVICE inline float exp( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::expf( x ); #else return std::exp( x ); @@ -991,7 +1003,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double exp( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::exp( double( x ) ); #else return std::exp( x ); @@ -1021,7 +1033,7 @@ __half2 exp( __half2 const x ) LVARRAY_HOST_DEVICE inline float log( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::logf( x ); #else return std::log( x ); @@ -1033,7 +1045,7 @@ template< typename T > LVARRAY_HOST_DEVICE inline double log( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::log( double( x ) ); #else return std::log( x ); diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 7e9cae5d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -211,7 +211,7 @@ LVARRAY_HOST_DEVICE inline void makeSorted( RandomAccessIterator const first, RandomAccessIterator const last, Compare && comp=Compare() ) { -#ifdef __CUDA_ARCH__ +#if defined(LVARRAY_DEVICE_COMPILE) if( last - first > internal::INTROSORT_THRESHOLD ) { internal::introsortLoop( first, last, comp ); diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index c4ba14cb..21392a17 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -616,6 +616,5 @@ TYPED_TEST( FixedSizeSquareMatrixTest, denseToSymmetric ) { this->denseToSymmetric(); } - } // namespace testing } // namespace LvArray From dc6df9d0623ecba7d0686c5050f140370dd13944 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 12 May 2022 10:10:55 -0700 
Subject: [PATCH 09/34] reactivate device error macro --- src/Macros.hpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index d4ff7562..b38f757b 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -110,7 +110,7 @@ * guaranteed. In fact it is only guaranteed to abort the current kernel. */ #if defined(LVARRAY_DEVICE_COMPILE) -// #if !defined(NDEBUG) + #if !defined(NDEBUG) #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -119,23 +119,24 @@ assert( false && "EXP = " STRINGIZE( EXP ) "MSG = " STRINGIZE( MSG ) ); \ } \ } while( false ) -// #else -// #define LVARRAY_ERROR_IF( EXP, MSG ) \ -// do \ -// { \ -// if( EXP ) \ -// { \ -// constexpr char const * formatString = "***** ERROR\n" \ -// "***** LOCATION: " LOCATION "\n" \ -// "***** Block: [%u, %u, %u]\n" \ -// "***** Thread: [%u, %u, %u]\n" \ -// "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ -// "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ -// printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ -// asm ( "trap;" ); \ -// } \ -// } while( false ) -// #endif + #else +#define LVARRAY_ERROR_IF( EXP, MSG ) \ + do \ + { \ + if( EXP ) \ + { \ + constexpr char const * formatString = "***** ERROR\n" \ + "***** LOCATION: " LOCATION "\n" \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ + asm ( "trap;" ); \ + } \ + } while( false ) + #endif + #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ From 08ef87505014303e0c7c95d7802aa03022d2c09f Mon Sep 17 00:00:00 2001 From: wrtobin Date: Fri, 13 May 2022 13:20:59 -0700 Subject: [PATCH 10/34] hip device namespace issue --- src/arrayManipulation.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 4b5c2d55..21f708e1 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -297,7 +297,8 @@ void resize( T * const LVARRAY_RESTRICT ptr, { if( newSize - size > 0 ) { - memset( reinterpret_cast< void * >( ptr + size ), 0, ( newSize - size ) * sizeof( T ) ); + std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); } } else From cee3b1cb3f6468f943598a3c929c57b345184b25 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Thu, 12 May 2022 13:16:34 -0400 Subject: [PATCH 11/34] more crusher changes --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 16 ++++++++++------ src/ArrayView.hpp | 10 +++++----- src/indexing.hpp | 2 +- src/sortedArrayManipulation.hpp | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index e12e2ec6..d76cde9b 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -4,16 +4,17 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") -set(CAMP_DIR 
"/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) -set(RAJA_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) +set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) -set(UMPIRE_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) set(ENABLE_CHAI TRUE CACHE BOOL "" ) -set(CHAI_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen3/cce-13.0.1/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" ) set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) @@ -46,9 +47,12 @@ set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) # set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link # set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) +set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) +set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) + # BLT WTF -#set(CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) -#set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +# set(CMAKE_HIP_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) +# set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc " CACHE STRING "" FORCE) set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 799ac461..7f9f8df3 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -186,7 +186,7 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ - //LVARRAY_HOST_DEVICE + LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -515,7 +515,7 @@ class ArrayView * @note This method is only active when NDIM > 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -534,7 +534,7 @@ class ArrayView * prevents that from happening. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const && noexcept = delete; @@ -544,7 +544,7 @@ class ArrayView * @note This method is only active when NDIM == 1. 
*/ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< _NDIM == 1, T & > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -558,7 +558,7 @@ class ArrayView * @param indices The indices of the value to access. */ template< typename ... INDICES > - LVARRAY_HOST_DEVICE inline constexpr + LVARRAY_HOST_DEVICE __forceinline__ constexpr T & operator()( INDICES... indices ) const { static_assert( sizeof ... (INDICES) == NDIM, "number of indices does not match NDIM" ); diff --git a/src/indexing.hpp b/src/indexing.hpp index 2dca4597..dbb6219f 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -44,7 +44,7 @@ struct ConditionalMultiply * @param b The right multiplication operand. */ template< typename A, typename B > - static inline LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) + static __forceinline__ LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) { return a * b; } }; diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index d4bdbeed..1291929d 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -407,7 +407,7 @@ bool isSortedUnique( ITER first, ITER const last, Compare && comp=Compare() ) */ DISABLE_HD_WARNING template< typename T, typename Compare=less< T > > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE __forceinline__ std::ptrdiff_t find( T const * const LVARRAY_RESTRICT ptr, std::ptrdiff_t const size, T const & value, From 9926a7013b9f8290c1d39ab78bf083a07573d803 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 13 May 2022 17:03:19 -0400 Subject: [PATCH 12/34] raja api change --- src/ArrayOfArraysView.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index e3aeb72a..d042d091 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,12 +732,7 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -// RAJA::inclusive_scan< POLICY >( capacities, -// capacities + numSubArrays, -// m_offsets.data() + 1 ); - - RAJA::inclusive_scan< POLICY >( RAJA::make_span< INDEX_TYPE const * >( capacities, numSubArrays ), - RAJA::make_span< INDEX_TYPE * >( m_offsets.data()+1, numSubArrays ) ); + RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... 
); } From 50cb3431390413b253c6cfaef508da01b66a3740 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 20 May 2022 14:00:33 -0400 Subject: [PATCH 13/34] cleanup and changes related to cpu-only build post 05.17.22 crusher downtime --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 30 ++++------ .../ORNL/crusher-cpu-cce@13.0.1.cmake | 60 +++++++++++++++++++ src/ArrayView.hpp | 8 +-- src/indexing.hpp | 2 +- src/sortedArrayManipulation.hpp | 2 +- 5 files changed, 76 insertions(+), 26 deletions(-) create mode 100644 host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index d76cde9b..157fc0ab 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -31,30 +31,20 @@ set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) # HIP Options set( ENABLE_HIP ON CACHE BOOL "" FORCE ) -set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation -set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) -set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation -set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) -set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) + set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) -# set( CMAKE_CXX_FLAGS "-D__HIP_PLATFORM_AMD__ -D__HIP_ROCclr -D__HIP_ARCH_GFX90A__=1 --rocm-path=${HIP_ROOT} -x hip" CACHE STRING "" FORCE ) + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() -# set( HIP_HIPCC_INCLUDE_ARGS "$<$:-I/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/include>" CACHE STRING "" FORCE ) -# set( HIP_HIPCC_FLAGS "-std=c++14" CACHE STRING "" FORCE ) # -fgpu-rdc" CACHE STRING "" FORCE ) - -# set( CMAKE_EXE_LINKER_FLAGS "-L/opt/cray/pe/mpich/8.1.12/ofi/crayclang/10.0/lib -lmpi -L/opt/cray/pe/mpich/8.1.12/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE ) # -fgpu-rdc --hip-link -# set( CMAKE_CXX_LINK_FLAGS ${CMAKE_EXE_LINKER_FLAGS} ) - -set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) -set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) - -# BLT WTF -# set(CMAKE_HIP_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE) -# set(CMAKE_CXX_LINK_FLAGS "-fgpu-rdc " CACHE STRING "" FORCE) - -set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror # GTEST options set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake new file mode 100644 index 00000000..1a12fc7d --- /dev/null +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -0,0 +1,60 @@ + +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-mej6trivmy7o5vlr6a52cml6tzxb5fvk" CACHE PATH "" ) + 
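+# NOTE: these TPL hashes differ from the HIP-enabled config above; the
+# cpu-only stack was presumably rebuilt without device support.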
+set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-tmukf35ms7f2pkfswpejbnt3jtnpkakc" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-2022.03.0-unirfq5er4vtyr2koymgi3xxq6h2f5l5" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-aggyh463v2rz6s44laqshylc4xeeg4h7" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" ) + +# C++ options +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) + set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() + +set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror + +# GTEST options +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +#BLT +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 7f9f8df3..1a2a3d03 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -515,7 +515,7 @@ class ArrayView * @note This method is only active when NDIM > 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -534,7 +534,7 @@ class ArrayView * prevents that from happening. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< (_NDIM > 1), ArraySlice< T, NDIM - 1, USD - 1, INDEX_TYPE > > operator[]( INDEX_TYPE const index ) const && noexcept = delete; @@ -544,7 +544,7 @@ class ArrayView * @note This method is only active when NDIM == 1. */ template< int _NDIM=NDIM > - LVARRAY_HOST_DEVICE __forceinline__ CONSTEXPR_WITHOUT_BOUNDS_CHECK + LVARRAY_HOST_DEVICE inline CONSTEXPR_WITHOUT_BOUNDS_CHECK std::enable_if_t< _NDIM == 1, T & > operator[]( INDEX_TYPE const index ) const & noexcept { @@ -558,7 +558,7 @@ class ArrayView * @param indices The indices of the value to access. */ template< typename ... INDICES > - LVARRAY_HOST_DEVICE __forceinline__ constexpr + LVARRAY_HOST_DEVICE inline constexpr T & operator()( INDICES... indices ) const { static_assert( sizeof ... 
(INDICES) == NDIM, "number of indices does not match NDIM" ); diff --git a/src/indexing.hpp b/src/indexing.hpp index dbb6219f..2dca4597 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -44,7 +44,7 @@ struct ConditionalMultiply * @param b The right multiplication operand. */ template< typename A, typename B > - static __forceinline__ LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) + static inline LVARRAY_HOST_DEVICE constexpr auto multiply( A const a, B const b ) { return a * b; } }; diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 1291929d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -407,7 +407,7 @@ bool isSortedUnique( ITER first, ITER const last, Compare && comp=Compare() ) */ DISABLE_HD_WARNING template< typename T, typename Compare=less< T > > -LVARRAY_HOST_DEVICE __forceinline__ +LVARRAY_HOST_DEVICE inline std::ptrdiff_t find( T const * const LVARRAY_RESTRICT ptr, std::ptrdiff_t const size, T const & value, From db28fd4513a27dfcbacd17f447e3a82a5f5c6216 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 20 May 2022 15:35:08 -0400 Subject: [PATCH 14/34] formatting --- src/Macros.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index b38f757b..cffdaed6 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -109,8 +109,11 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ +// cce processes __host__ functions with __hip_device_compile__=1 when -x hip? +// the entire compilation unit has __hip_device_compile__=1, whereas __cuda_arch__ +// seems to be scope-defined as it isn't defined in __host__ functions #if defined(LVARRAY_DEVICE_COMPILE) - #if !defined(NDEBUG) + #if !defined(NDEBUG) || __HIP_DEVICE_COMPILE__ #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -127,16 +130,15 @@ { \ constexpr char const * formatString = "***** ERROR\n" \ "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ asm ( "trap;" ); \ } \ } while( false ) #endif - #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ From eee17ae2bf135ee6c9760b924af4e523652b22eb Mon Sep 17 00:00:00 2001 From: William Tobin Date: Fri, 3 Jun 2022 15:05:01 -0400 Subject: [PATCH 15/34] cce@14.0.0 and rocm@5.2.0 --- host-configs/ORNL/crusher-cce@14.0.0.cmake | 69 +++++++ src/Macros.hpp | 4 + src/math.hpp | 206 +++++++++++---------- 3 files changed, 178 insertions(+), 101 deletions(-) create mode 100644 host-configs/ORNL/crusher-cce@14.0.0.cmake diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake new file mode 100644 index 00000000..6a509960 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -0,0 +1,69 @@ + +set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR 
"/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") + + + + + + diff --git a/src/Macros.hpp b/src/Macros.hpp index cffdaed6..60a545a5 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -31,6 +31,9 @@ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define LVARRAY_DEVICE_COMPILE + #define LVARRAY_FORCE_INLINE __forceinline__ +#else + #define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) @@ -38,6 +41,7 @@ #endif + //#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) #include //#endif diff --git a/src/math.hpp b/src/math.hpp index 3bf2d9fd..d2b07191 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -45,7 +45,7 @@ namespace internal * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( T const, U const u ) { return u; } @@ -55,7 +55,7 @@ T convert( T const, U const u ) * @return The number of values stored in @tparam T, by default this is 1. 
*/ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( T const ) { return 1; } @@ -76,7 +76,7 @@ struct SingleType * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getFirst( T const x ) { return x; } @@ -86,7 +86,7 @@ SingleType< T > getFirst( T const x ) * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } @@ -96,10 +96,17 @@ SingleType< T > getSecond( T const x ) * @param x The first value. * @param y The second value. */ -template< typename T > -LVARRAY_HOST_DEVICE inline constexpr -T lessThan( T const x, T const y ) -{ return __hlt( x, y ); } + LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +__half lessThan( __half const x, __half const y ) +{ + return __hlt( x, y ); +} + +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +__half2 lessThan( __half2 const x, __half2 const y ) +{ + return __hlt2( x, y ); +} #if defined( LVARRAY_USE_CUDA ) /** @@ -110,7 +117,7 @@ T lessThan( T const x, T const y ) * @return @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half convert( __half const, U const u ) { return __float2half_rn( u ); } @@ -122,7 +129,7 @@ __half convert( __half const, U const u ) * @return A @c __half2 with both halves having value @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u ) { return __float2half2_rn( u ); } @@ -131,7 +138,7 @@ __half2 convert( __half2 const, U const u ) * @param u The value to convert. * @return A @c __half2 with both halves having value @p u. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -151,7 +158,7 @@ __half2 convert( __half2 const, __half const u ) * @return A @c __half2 containing @p u as the first value and @p v as the second. */ template< typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u, V const v ) { return __floats2half2_rn( u, v ); } @@ -161,7 +168,7 @@ __half2 convert( __half2 const, U const u, V const v ) * @param v The second value to convert. * @return A @c __half2 containing @p u as the first value and @p v as the second. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u, __half const v ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -175,7 +182,7 @@ __half2 convert( __half2 const, __half const u, __half const v ) * @brief Return the number of values stored in a @c __half2, which is 2. * @return The number of values stored in a @c __half2, which is 2. */ -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( __half2 const & ) { return 2; } @@ -193,7 +200,7 @@ struct SingleType< __half2 > * @return The fist @c __half in @p x. * @param x The value to query. 
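 * @note Illustrative device-side usage, assuming a CUDA translation unit:
 * @code
 * __half2 const pair = __floats2half2_rn( 1.0f, 2.0f );
 * __half const lo = getFirst( pair ); // the low half, here 1.0
 * @endcode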
*/ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getFirst( __half2 const x ) { return __low2half( x ); } @@ -201,7 +208,7 @@ __half getFirst( __half2 const x ) * @return The second @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } @@ -210,7 +217,7 @@ __half getSecond( __half2 const x ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half lessThan( __half const x, __half const y ) { return __hlt( x, y ); } @@ -219,7 +226,7 @@ __half lessThan( __half const x, __half const y ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } @@ -238,7 +245,7 @@ __half2 lessThan( __half2 const x, __half2 const y ) * @return The number of values stored in type @tparam T. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues() { return internal::numValues( T() ); } @@ -258,7 +265,7 @@ using SingleType = typename internal::SingleType< T >::type; * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u ) { return internal::convert( T(), u ); } @@ -273,7 +280,7 @@ T convert( U const u ) * @return @p u, @p v converted to @tparam T. */ template< typename T, typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u, V const v ) { return internal::convert( T(), u, v ); } @@ -284,7 +291,7 @@ T convert( U const u, V const v ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getFirst( T const x ) { return internal::getFirst( x ); } @@ -295,7 +302,7 @@ SingleType< T > getFirst( T const x ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getSecond( T const x ) { return internal::getSecond( x ); } @@ -306,7 +313,7 @@ SingleType< T > getSecond( T const x ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { @@ -317,10 +324,10 @@ max( T const a, T const b ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half max( __half const a, __half const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -333,13 +340,11 @@ __half max( __half const a, __half const b ) } /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax2( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hgt2( a, b ) ? 
a : b; #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -357,7 +362,7 @@ __half2 max( __half2 const a, __half2 const b ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { @@ -371,26 +376,24 @@ min( T const a, T const b ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half min( __half const a, __half const b ) { -#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hlt( a, b ) ? a : b; #else return a < b ? a : b; #endif } /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half2 min( __half2 const a, __half2 const b ) { -#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmin2( a, b ); -#elif defined(LVARRAY_USE_HIP) - return __hlt2( a, b ) ? a : b; #else __half2 const aFactor = __hle2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -406,7 +409,7 @@ __half2 min( __half2 const a, __half2 const b ) * @note This set of overloads is valid for any numeric type. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T abs( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -416,10 +419,10 @@ T abs( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half abs( __half const x ) { #if CUDART_VERSION > 11000 @@ -430,7 +433,7 @@ __half abs( __half const x ) } /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 abs( __half2 const x ) { #if CUDART_VERSION > 11000 @@ -448,7 +451,7 @@ __half2 abs( __half2 const x ) * @param x The value to square. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T square( T const x ) { return x * x; } @@ -465,7 +468,7 @@ T square( T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is @c double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sqrt( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -477,7 +480,7 @@ float sqrt( float const x ) /// @copydoc sqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sqrt( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -487,15 +490,15 @@ double sqrt( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sqrt( __half const x ) { return ::hsqrt( x ); } /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sqrt( __half2 const x ) { return ::h2sqrt( x ); } @@ -507,7 +510,7 @@ __half2 sqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. 
If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float invSqrt( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -519,7 +522,7 @@ float invSqrt( float const x ) /// @copydoc invSqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double invSqrt( T const x ) { #if defined( LVARRAY_DEVICE_COMPILE ) @@ -529,15 +532,15 @@ double invSqrt( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half invSqrt( __half const x ) { return ::hrsqrt( x ); } /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 invSqrt( __half2 const x ) { return ::h2rsqrt( x ); } @@ -556,7 +559,7 @@ __half2 invSqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sin( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -568,7 +571,7 @@ float sin( float const theta ) /// @copydoc sin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sin( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -578,15 +581,15 @@ double sin( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sin( __half const theta ) { return ::hsin( theta ); } /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sin( __half2 const theta ) { return ::h2sin( theta ); } @@ -598,7 +601,7 @@ __half2 sin( __half2 const theta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float cos( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -610,7 +613,7 @@ float cos( float const theta ) /// @copydoc cos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double cos( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -620,15 +623,15 @@ double cos( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half cos( __half const theta ) { return ::hcos( theta ); } /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 cos( __half2 const theta ) { return ::h2cos( theta ); } @@ -640,7 +643,7 @@ __half2 cos( __half2 const theta ) * @param sinTheta The sine of @p theta. * @param cosTheta The cosine of @p theta. 
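 * @note A minimal usage sketch:
 * @code
 * float s, c;
 * sincos( 0.5f, s, c ); // computes sin( 0.5f ) and cos( 0.5f ) in one call
 * @endcode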
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( float const theta, float & sinTheta, float & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -657,7 +660,7 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( double const theta, double & sinTheta, double & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -670,7 +673,7 @@ void sincos( double const theta, double & sinTheta, double & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( T const theta, double & sinTheta, double & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -684,10 +687,10 @@ void sincos( T const theta, double & sinTheta, double & cosTheta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) { sinTheta = ::hsin( theta ); @@ -695,7 +698,7 @@ void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) } /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) { sinTheta = ::h2sin( theta ); @@ -710,7 +713,7 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
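 * @note The @c __half overloads below have no dedicated tangent intrinsic and
 * are composed from @c sincos as @code tan( theta ) == sin( theta ) / cos( theta ) @endcode.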
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float tan( float const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -722,7 +725,7 @@ float tan( float const theta ) /// @copydoc tan( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double tan( T const theta ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -732,10 +735,10 @@ double tan( T const theta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half tan( __half const theta ) { __half s, c; @@ -744,7 +747,7 @@ __half tan( __half const theta ) } /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 tan( __half2 const theta ) { __half2 s, c; @@ -776,7 +779,7 @@ namespace internal * @note Modified from https://developer.download.nvidia.com/cg/asin.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T asinImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -798,7 +801,7 @@ T asinImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/acos.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T acosImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -820,7 +823,8 @@ T acosImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/atan2.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE T atan2Impl( T const y, T const x ) { T const absX = abs( x ); @@ -854,7 +858,7 @@ T atan2Impl( T const y, T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float asin( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -866,7 +870,7 @@ float asin( float const x ) /// @copydoc asin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double asin( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -876,15 +880,15 @@ double asin( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half asin( __half const x ) { return internal::asinImpl( x ); } /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 asin( __half2 const x ) { return internal::asinImpl( x ); } @@ -896,7 +900,7 @@ __half2 asin( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
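 * @note A quick sanity check, purely illustrative:
 * @code
 * float const halfPi = asin( 1.0f ); // approximately 1.5708f
 * @endcode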
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float acos( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -908,7 +912,7 @@ float acos( float const x ) /// @copydoc acos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double acos( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -918,15 +922,15 @@ double acos( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half acos( __half const x ) { return internal::acosImpl( x ); } /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 acos( __half2 const x ) { return internal::acosImpl( x ); } @@ -939,7 +943,7 @@ __half2 acos( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float atan2( float const y, float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -951,7 +955,7 @@ float atan2( float const y, float const x ) /// @copydoc atan2( float, float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double atan2( T const y, T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -964,12 +968,12 @@ double atan2( T const y, T const x ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half atan2( __half const y, __half const x ) { return internal::atan2Impl( y, x ); } /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 atan2( __half2 const y, __half2 const x ) { return internal::atan2Impl( y, x ); } @@ -988,7 +992,7 @@ __half2 atan2( __half2 const y, __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float exp( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1000,7 +1004,7 @@ float exp( float const x ) /// @copydoc exp( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double exp( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1010,15 +1014,15 @@ double exp( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half exp( __half const x ) { return ::hexp( x ); } /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 exp( __half2 const x ) { return ::h2exp( x ); } @@ -1030,7 +1034,7 @@ __half2 exp( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
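 * @note Unlike a plain arc tangent, the quadrant is recovered from the signs of
 * both arguments: @code atan2( 1.0f, -1.0f ) @endcode is 3 * pi / 4, not -pi / 4.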
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float log( float const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1042,7 +1046,7 @@ float log( float const x ) /// @copydoc log( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double log( T const x ) { #if defined(LVARRAY_DEVICE_COMPILE) @@ -1052,15 +1056,15 @@ double log( T const x ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half log( __half const x ) { return ::hlog( x ); } /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 log( __half2 const x ) { return ::h2log( x ); } From 130837bdc1b2b64dceb02a4cb110e28340744be8 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 7 Jun 2022 12:05:14 -0700 Subject: [PATCH 16/34] remove spock, add crusher-base --- host-configs/ORNL/crusher-base.cmake | 25 ++++++++++++ host-configs/ORNL/crusher-cce@13.0.1.cmake | 26 +------------ host-configs/ORNL/crusher-cce@14.0.0.cmake | 29 +------------- .../ORNL/crusher-cpu-cce@13.0.1.cmake | 35 +---------------- host-configs/ORNL/spock-cce@12.0.3.cmake | 39 ------------------- 5 files changed, 31 insertions(+), 123 deletions(-) create mode 100644 host-configs/ORNL/crusher-base.cmake delete mode 100644 host-configs/ORNL/spock-cce@12.0.3.cmake diff --git a/host-configs/ORNL/crusher-base.cmake b/host-configs/ORNL/crusher-base.cmake new file mode 100644 index 00000000..53f647fa --- /dev/null +++ b/host-configs/ORNL/crusher-base.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 157fc0ab..30cf0bc7 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,5 +1,5 @@ -set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") @@ -24,37 +24,15 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) if( ENABLE_HIP ) set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation - set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) + set( HIP_ROOT 
"/opt/rocm-4.5.2" CACHE PATH "" ) set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) endif() - -set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -#BLT -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake index 6a509960..f3d051a2 100644 --- a/host-configs/ORNL/crusher-cce@14.0.0.cmake +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -1,5 +1,6 @@ set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") +include( crusher-base.cmake ) # Set up the tpls set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") @@ -25,44 +26,18 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH " set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) - if( ENABLE_HIP ) set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) - + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) endif() -# suppress -Werror for now -set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) - -# GTEST -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -# disable most binaries and doc generation -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -# BLT trying to find MPI fails on cray with cce -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") - - diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake index 1a12fc7d..a4c98307 100644 --- a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -1,5 +1,6 @@ -set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "") +include( crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") @@ -24,37 +25,5 @@ set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "") set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "") set(CMAKE_Fortran_COMPILER 
"/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) - # HIP Options set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) - -if( ENABLE_HIP ) - set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation - - set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" ) - set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" ) - - set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) - set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) - set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) - set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) -endif() - -set(ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) #suppress adding -Werror - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") - -set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) -set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) - -#BLT -set(ENABLE_FIND_MPI FALSE CACHE BOOL "") \ No newline at end of file diff --git a/host-configs/ORNL/spock-cce@12.0.3.cmake b/host-configs/ORNL/spock-cce@12.0.3.cmake deleted file mode 100644 index f0764c32..00000000 --- a/host-configs/ORNL/spock-cce@12.0.3.cmake +++ /dev/null @@ -1,39 +0,0 @@ -set(CONFIG_NAME "spock-cce@12.0.3" CACHE PATH "") - -# Set up the tpls -set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/scratch/tobin6/spack/opt/spack/cray-sles15-zen2/cce-12.0.3" CACHE PATH "") -set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") - -set(BLT_DIR "${GEOSX_TPL_DIR}/blt-0.4.1-qpmhf6p7n5sarmks55hgjnzff3ncs7jd/" CACHE PATH "" ) -set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-frggdmwjevbxy4a6kw7ctgrhyv7erfhr/" CACHE PATH "" ) - -set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-6.0.0-nkdetdg5tjyzzf5yjzo32jxwkmwfjjqn/" CACHE PATH "" ) -set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-0.14.0-wun25mr5qf7vo6x2vblhzh2ivs7vr4g6/" CACHE PATH "" ) -set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2.4.0-a5ponjo23u7smy7w4a4jj7im47shrsxk/" CACHE PATH "" ) - -set(METIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/metis-5.1.0-rbblqiymq6eoursordyaq2ghimzpd22v/" CACHE PATH "" ) -set(PARMETIS_DIR "/sw/spock/spack-envs/base/opt/cray-sles15-zen2/cce-12.0.3/parmetis-4.0.3-mliemgo6vxrahsz4f6u5agdqyfpk2yd2/" CACHE PATH "" ) - -# C++ options -#set(CMAKE_C_COMPILER "/opt/cray/pe/cce/12.0.3/bin/craycc" CACHE PATH "") -#set(CMAKE_CXX_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayCC" CACHE PATH "") -#set(CMAKE_Fortran_COMPILER "/opt/cray/pe/cce/12.0.3/bin/crayftn" CACHE PATH "") - -set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.11/bin/cc" CACHE PATH "") -set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.11/bin/CC" CACHE PATH "") -set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.11/bin/ftn" CACHE PATH "") - -set(CMAKE_CXX_STANDARD 14 CACHE STRING "") - -set( ENABLE_MPI ON CACHE BOOL "" FORCE ) -set( ENABLE_FIND_MPI OFF CACHE BOOL "" FORCE ) - -# HIP Options -set( ENABLE_HIP ON CACHE BOOL "" FORCE ) -set( HIP_ROOT "/opt/rocm-4.2.0" CACHE PATH "" ) -set( HIP_VERSION_STRING "4.2.0" CACHE STRING "" ) -set( CMAKE_HIP_ARCHITECTURES "gfx908" CACHE STRING "" FORCE ) - -# GTEST options -set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") -set(gtest_disable_pthreads ON CACHE BOOL "") From 6e3c775f3d6dd0b71240fa3d4cfe2502fe7c10f6 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 
28 Jun 2022 14:23:28 -0400 Subject: [PATCH 17/34] cmake path --- host-configs/ORNL/crusher-cce@13.0.1.cmake | 1 + host-configs/ORNL/crusher-cce@14.0.0.cmake | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake index 30cf0bc7..a10fda43 100644 --- a/host-configs/ORNL/crusher-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -1,5 +1,6 @@ set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake index f3d051a2..967be640 100644 --- a/host-configs/ORNL/crusher-cce@14.0.0.cmake +++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake @@ -1,6 +1,6 @@ set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "") -include( crusher-base.cmake ) +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") From 9e26620986f5a5406baa228476fd8d4031a77781 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 28 Jun 2022 13:55:39 -0700 Subject: [PATCH 18/34] lessthan --- src/math.hpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/math.hpp b/src/math.hpp index d2b07191..5e29dcaa 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -90,24 +90,6 @@ LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } -/** - * @return 1 if @p x is less than @p y, else 0. - * @tparam T The type of @p x and @p y. - * @param x The first value. - * @param y The second value. - */ - LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE -__half lessThan( __half const x, __half const y ) -{ - return __hlt( x, y ); -} - -LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE -__half2 lessThan( __half2 const x, __half2 const y ) -{ - return __hlt2( x, y ); -} - #if defined( LVARRAY_USE_CUDA ) /** * @brief Convert @p u to @c __half. @@ -212,6 +194,9 @@ LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } +#endif + +#if defined( LVARRAY_USE_DEVICE ) /** * @return 1 if @p x is less than @p y, else 0. * @param x The first value. @@ -229,7 +214,6 @@ __half lessThan( __half const x, __half const y ) LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } - #endif } // namespace internal From 9b7f84aa51c912e17397580416c385362fb47cb1 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:27:39 -0700 Subject: [PATCH 19/34] lt --- src/math.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/math.hpp b/src/math.hpp index 5e29dcaa..61d0237e 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -90,6 +90,17 @@ LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } +/** + * @return 1 if @p x is less than @p y, else 0. + * @tparam T The type of @p x and @p y. + * @param x The first value. + * @param y The second value. + */ +template< typename T > +LVARRAY_HOST_DEVICE inline constexpr +T lessThan( T const x, T const y ) +{ return __hlt( x, y ); } + #if defined( LVARRAY_USE_CUDA ) /** * @brief Convert @p u to @c __half. 
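
Note on the lessThan shuffling in the two patches above: its 0/1 result is what
keeps the half-precision approximations in math.hpp branch-free, since the mask
is multiplied into the polynomial rather than branched on. A minimal host-side
sketch of the idiom, where select is an illustrative helper (not part of the
library) and T is assumed to be a scalar whose comparison yields exactly 0 or 1:

    template< typename T >
    T select( T const mask, T const onTrue, T const onFalse )
    {
      // mask is assumed to be exactly 0 or 1, so no branch is required.
      return mask * onTrue + ( T( 1 ) - mask ) * onFalse;
    }

The __half2 fallbacks for max and min apply the same pattern with __hge2 and __hle2.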
From ad7c1d906d9e6f433de1ea923f525573ae398858 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:36:07 -0700 Subject: [PATCH 20/34] removing pragma no longer needed on crusher --- src/Macros.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Macros.hpp b/src/Macros.hpp index 60a545a5..6e3536c9 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -12,8 +12,6 @@ #pragma once -#pragma clang diagnostic ignored "-Wfloat-equal" - // Source includes #include "LvArrayConfig.hpp" #include "system.hpp" From e531497a149c3a49319ec6aac0e91dcd7601a876 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:48:53 -0700 Subject: [PATCH 21/34] remove shim type temporarily used for raja versioning differences --- src/bufferManipulation.hpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 548cfe2b..83e5a00e 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -69,21 +69,6 @@ namespace bufferManipulation */ HAS_MEMBER_FUNCTION_NO_RTYPE( move, MemorySpace::host, true ); - -template < typename T > -struct ContainerShim -{ - ContainerShim( T * begin, T * end ) - : m_begin( begin ) - , m_end( end ) - {} - T * begin() const { return m_begin; } - T * end() const { return m_end; } - T * m_begin; - T * m_end; -}; - - /** * @class VoidBuffer * @brief This class implements the default behavior for the Buffer methods related From a5949848876bda458980e29822fb31fc70e98d01 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 08:59:18 -0700 Subject: [PATCH 22/34] doc --- src/Macros.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Macros.hpp b/src/Macros.hpp index 6e3536c9..8717e397 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -24,17 +24,22 @@ #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) + /// Macro defined when using a device. #define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + /// Macro defined when currently compiling on device (only defined in the device context). #define LVARRAY_DEVICE_COMPILE + /// Marks a function/lambda for inlining #define LVARRAY_FORCE_INLINE __forceinline__ #else + /// Marks a function/lambda for inlining #define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) + // Denotes whether to define decorator macros later in this file. 
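+  // (__CUDACC__ and __HIPCC__ are defined by the CUDA and HIP compiler drivers
+  // on every pass over a device-capable translation unit, so the decorator
+  // macros guarded here are visible to host and device code alike.)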
#define LVARRAY_DECORATE #endif From 286929765be04d96af894127cc4049cff420d15e Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:13:42 -0700 Subject: [PATCH 23/34] versioning, merge from dev, bugs only showing up in tests which weren't compiled on crusher --- src/ArrayOfArraysView.hpp | 4 ++++ unitTests/testArray1DOfArray1DOfArray1D.cpp | 2 +- unitTests/testMath.cpp | 1 + unitTests/testSortedArray.cpp | 2 +- unitTests/testTensorOpsTwoSizes1.cpp | 4 ++-- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index d042d091..72695e3d 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,7 +732,11 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; +#if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); +#else + RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); +#endif }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... ); } diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp index cdd17fe2..5038d778 100644 --- a/unitTests/testArray1DOfArray1DOfArray1D.cpp +++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp @@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types< , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy > , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy > #endif -#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI) , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp index d7c76b19..f6d193cb 100644 --- a/unitTests/testMath.cpp +++ b/unitTests/testMath.cpp @@ -154,6 +154,7 @@ using TestMathTypes = ::testing::Types< #endif #if defined( LVARRAY_USE_CUDA ) , std::pair< __half, parallelDevicePolicy< 32 > > +#endif >; TYPED_TEST_SUITE( TestMath, TestMathTypes, ); diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp index ae145fbd..fe52ddfc 100644 --- a/unitTests/testSortedArray.cpp +++ b/unitTests/testSortedArray.cpp @@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types< std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy > -#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined( LVARRAY_USE_HIP ) ) && defined(LVARRAY_USE_CHAI) , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp index 101c4671..96ac793c 100644 --- a/unitTests/testTensorOpsTwoSizes1.cpp +++ b/unitTests/testTensorOpsTwoSizes1.cpp @@ -938,9 +938,9 @@ using TwoSizesTestTypes = ::testing::Types< #endif >; -TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes ); +TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, ); + - TYPED_TEST( 
TwoSizesTest, scale ) { this->testScale(); From 82b3d4bda74828e9072351d7552f7c35ebf09a69 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:20:18 -0700 Subject: [PATCH 24/34] crustify --- src/Array.hpp | 2 +- src/ArrayOfArraysView.hpp | 2 +- src/CRSMatrix.hpp | 2 +- src/Macros.hpp | 32 +++++++++++++++--------------- src/math.hpp | 6 +++--- unitTests/testArray_ChaiBuffer.cpp | 2 +- unitTests/testChaiBuffer.cpp | 2 +- unitTests/testUtils.hpp | 2 +- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/Array.hpp b/src/Array.hpp index 28ef6f95..503d4750 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,7 +91,7 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(LVARRAY_DEVICE_COMPILE) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif #if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 72695e3d..58aa5e96 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -733,7 +733,7 @@ class ArrayOfArraysView { m_offsets[ 0 ] = 0; #if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 - RAJA::inclusive_scan< POLICY >( RAJA::make_span(capacities, numSubArrays), RAJA::make_span(m_offsets.data() + 1, numSubArrays) ); + RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); #else RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); #endif diff --git a/src/CRSMatrix.hpp b/src/CRSMatrix.hpp index daffdd9e..ddd786c5 100644 --- a/src/CRSMatrix.hpp +++ b/src/CRSMatrix.hpp @@ -139,7 +139,7 @@ class CRSMatrix : protected CRSMatrixView< T, COL_TYPE, INDEX_TYPE, BUFFER_TYPE RAJA::forall< POLICY >( RAJA::TypedRangeSegment< INDEX_TYPE >( 0, numRows() ), [view] LVARRAY_HOST_DEVICE ( INDEX_TYPE const row ) { - INDEX_TYPE const nnz = view.numNonZeros( row ); + INDEX_TYPE const nnz = view.numNonZeros( row ); T * const entries = view.getEntries( row ); arrayManipulation::destroy( entries, nnz ); } ); diff --git a/src/Macros.hpp b/src/Macros.hpp index 8717e397..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -24,23 +24,23 @@ #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) - /// Macro defined when using a device. - #define LVARRAY_USE_DEVICE +/// Macro defined when using a device. +#define LVARRAY_USE_DEVICE #endif #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - /// Macro defined when currently compiling on device (only defined in the device context). - #define LVARRAY_DEVICE_COMPILE - /// Marks a function/lambda for inlining - #define LVARRAY_FORCE_INLINE __forceinline__ +/// Macro defined when currently compiling on device (only defined in the device context). +#define LVARRAY_DEVICE_COMPILE +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE __forceinline__ #else - /// Marks a function/lambda for inlining - #define LVARRAY_FORCE_INLINE inline +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE inline #endif #if defined(__CUDACC__) || defined(__HIPCC__) - // Denotes whether to define decorator macros later in this file. - #define LVARRAY_DECORATE +// Denotes whether to define decorator macros later in this file. 
+#define LVARRAY_DECORATE #endif @@ -137,10 +137,10 @@ { \ constexpr char const * formatString = "***** ERROR\n" \ "***** LOCATION: " LOCATION "\n" \ - "***** Block: [%u, %u, %u]\n" \ - "***** Thread: [%u, %u, %u]\n" \ - "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ - "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ + "***** Block: [%u, %u, %u]\n" \ + "***** Thread: [%u, %u, %u]\n" \ + "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n" \ + "***** MSG: " STRINGIZE( MSG ) "\n\n"; \ printf( formatString, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z ); \ asm ( "trap;" ); \ } \ @@ -578,9 +578,9 @@ * the host. To use place directly above a the template. */ #if defined(LVARRAY_USE_CUDA) - #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") +#define DISABLE_HD_WARNING _Pragma("hd_warning_disable") #else - #define DISABLE_HD_WARNING +#define DISABLE_HD_WARNING #endif #else /// Mark a function for both host and device usage. diff --git a/src/math.hpp b/src/math.hpp index 61d0237e..d45f68f3 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -339,7 +339,7 @@ LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { #if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - return __hmax2( a, b ); + return __hmax2( a, b ); #else __half2 const aFactor = __hge2( a, b ); __half2 const bFactor = convert< __half2 >( 1 ) - aFactor; @@ -643,9 +643,9 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) { #if defined(LVARRAY_DEVICE_COMPILE) #if defined(LVARRAY_USE_CUDA) - ::sincos( theta, &sinTheta, &cosTheta ); + ::sincos( theta, &sinTheta, &cosTheta ); #elif defined(LVARRAY_USE_HIP) - ::sincosf( theta, &sinTheta, &cosTheta ); + ::sincosf( theta, &sinTheta, &cosTheta ); #endif #else sinTheta = std::sin( theta ); diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp index 5ef2a6a1..8bd5aaed 100644 --- a/unitTests/testArray_ChaiBuffer.cpp +++ b/unitTests/testArray_ChaiBuffer.cpp @@ -144,7 +144,7 @@ TYPED_TEST( ArrayTest, DeviceAlloc ) { this->testHIPDeviceAlloc(); } - + #endif } // namespace testing diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index ae12886f..27b3401a 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -44,7 +44,7 @@ class ChaiBufferTest : public ::testing::Test #elif defined( LVARRAY_USE_HIP ) auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; - std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp index 639b20ad..a4a3efa1 100644 --- a/unitTests/testUtils.hpp +++ b/unitTests/testUtils.hpp @@ -85,7 +85,7 @@ struct RAJAHelper< RAJA::hip_exec< N > > using AtomicPolicy = RAJA::hip_atomic; static constexpr MemorySpace space = MemorySpace::hip; }; - + #endif template< typename POLICY, typename INDEX_TYPE, typename LAMBDA > From d97d49979c84d35ded9e7b445b9eb8918ff3adcc Mon Sep 17 00:00:00 2001 From: wrtobin Date: Thu, 30 Jun 2022 10:35:03 -0700 Subject: 
[PATCH 25/34] better raja scan version guard --- src/ArrayOfArraysView.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 58aa5e96..592154d8 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -732,7 +732,7 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -#if RAJA_VERSION_MAJOR >= 1 && RAJA_VERSION_MINOR >= 13 +#if ( RAJA_VERSION_MAJOR == 1 && RAJA_VERSION_MINOR >= 13 ) || ( RAJA_VERSION_MAJOR > 1 ) RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); #else RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); From deb29b66f351e7286b45ec04ef5937d3ed797a38 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 5 Jul 2022 13:58:57 -0700 Subject: [PATCH 26/34] nvcc vs cce (hip-clang) differences --- host-configs/LLNL/lassen-base.cmake | 6 +++--- src/ArrayOfArraysView.hpp | 3 --- src/ArrayView.hpp | 3 --- src/CRSMatrixView.hpp | 2 -- unitTests/testTypeManipulation.cpp | 2 +- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..3a60a7f3 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -21,14 +21,14 @@ set(ENABLE_CUDA ON CACHE BOOL "") set(CUDA_TOOLKIT_ROOT_DIR /usr/tce/packages/cuda/cuda-10.1.243 CACHE STRING "") set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "") set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "") -set(CUDA_ARCH sm_70 CACHE STRING "") +set(CUDA_ARCHITECTURES sm_70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 14 CACHE STRING "") -set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") +set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCHITECTURES} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") -set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) +set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCHITECTURES}" CACHE STRING "" FORCE) # Uncomment this line to make nvcc output register usage for each kernel. # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --resource-usage" CACHE STRING "" FORCE) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 592154d8..5efb4bc0 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -202,14 +202,12 @@ class ArrayOfArraysView * @brief A constructor to create an uninitialized ArrayOfArraysView. * @note An uninitialized ArrayOfArraysView should not be used until it is assigned to. */ - LVARRAY_HOST_DEVICE ArrayOfArraysView() = default; /** * @brief Default copy constructor. * @note The copy constructor will trigger the copy constructor for @tparam BUFFER_TYPE */ - LVARRAY_HOST_DEVICE ArrayOfArraysView( ArrayOfArraysView const & ) = default; /** @@ -246,7 +244,6 @@ class ArrayOfArraysView * @brief Default copy assignment operator. * @return *this. 
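   * @note The defaulted members in this class now carry no execution-space
   * annotation; nvcc and hip-clang infer the attributes of explicitly defaulted
   * functions themselves, and spelling them out appears to be exactly where the
   * two compilers disagree.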
*/ - LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView const & ) = default; diff --git a/src/ArrayView.hpp b/src/ArrayView.hpp index 1a2a3d03..aabd48bf 100644 --- a/src/ArrayView.hpp +++ b/src/ArrayView.hpp @@ -118,7 +118,6 @@ class ArrayView * @brief A constructor to create an uninitialized ArrayView. * @note An uninitialized ArrayView should not be used until it is assigned to. */ - LVARRAY_HOST_DEVICE ArrayView() = default; /** @@ -186,7 +185,6 @@ class ArrayView * ArrayView< int, 1, 0, std::ptrdiff_t, MallocBuffer > anotherView = std::move( view ); * @endcode */ - LVARRAY_HOST_DEVICE ArrayView( ArrayView && source ) = default; /** @@ -208,7 +206,6 @@ class ArrayView {} /// The default destructor. - LVARRAY_HOST_DEVICE ~ArrayView() = default; /** diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp index fe3c7c99..9a8bbca5 100644 --- a/src/CRSMatrixView.hpp +++ b/src/CRSMatrixView.hpp @@ -106,13 +106,11 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE /** * @brief Default copy constructor. */ - LVARRAY_HOST_DEVICE CRSMatrixView( CRSMatrixView const & ) = default; /** * @brief Default move constructor. */ - LVARRAY_HOST_DEVICE inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp index 45ad8380..494fb038 100644 --- a/unitTests/testTypeManipulation.cpp +++ b/unitTests/testTypeManipulation.cpp @@ -81,7 +81,7 @@ CUDA_TEST( typeManipulation, forEachArg ) EXPECT_EQ( intReducer.get(), 2 ); EXPECT_EQ( floatReducer.get(), 4 ); EXPECT_EQ( doubleReducer.get(), 7 ); -#eli defined(LVARRAY_USE_HIP) +#elif defined(LVARRAY_USE_HIP) // Test on device. RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 ); RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 ); From 62bef508a73db67b9f43544bea64916613186558 Mon Sep 17 00:00:00 2001 From: William Tobin Date: Tue, 23 Aug 2022 19:14:56 -0400 Subject: [PATCH 27/34] crusher debugging and host-configs --- cmake/CMakeBasics.cmake | 6 +-- host-configs/ORNL/crusher-cce@14.0.1.cmake | 39 +++++++++++++++++++ host-configs/ORNL/crusher-cce@14.0.2.cmake | 39 +++++++++++++++++++ .../ORNL/crusher-cpu-cce@13.0.1.cmake | 2 +- .../ORNL/crusher-cpu-cce@14.0.1.cmake | 30 ++++++++++++++ src/ChaiBuffer.hpp | 5 +-- src/Macros.hpp | 28 ++++++------- 7 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 host-configs/ORNL/crusher-cce@14.0.1.cmake create mode 100644 host-configs/ORNL/crusher-cce@14.0.2.cmake create mode 100644 host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake diff --git a/cmake/CMakeBasics.cmake b/cmake/CMakeBasics.cmake index 4c3ec217..25c4bef7 100644 --- a/cmake/CMakeBasics.cmake +++ b/cmake/CMakeBasics.cmake @@ -12,9 +12,9 @@ option( ENABLE_TOTALVIEW_OUTPUT "" OFF ) set( LVARRAY_BUILD_OBJ_LIBS OFF CACHE BOOL "" ) -if( NOT BLT_CXX_STD STREQUAL c++14 ) - MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") -endif() +# if( NOT BLT_CXX_STD STREQUAL c++14 ) +# MESSAGE(FATAL_ERROR "c++14 is NOT enabled. 
LvArray requires c++14") +# endif() blt_append_custom_compiler_flag( FLAGS_VAR CMAKE_CXX_FLAGS DEFAULT "${OpenMP_CXX_FLAGS}") diff --git a/host-configs/ORNL/crusher-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cce@14.0.1.cmake new file mode 100644 index 00000000..15c54516 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.1.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@14.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() diff --git a/host-configs/ORNL/crusher-cce@14.0.2.cmake b/host-configs/ORNL/crusher-cce@14.0.2.cmake new file mode 100644 index 00000000..d0e29023 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@14.0.2.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@14.0.2" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.2" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.2" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-ksdglvlmamju7gphtyzdavitriemedla" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.1-jxxcauxbzee6nqjmyjz45t5h4f7tv34r" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.1-vgvqpvi3cwdmvy6cu76sqoghnvprzlwu" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-7axkiea7q3hzgojswiz7qdbd2yq6bvsf" CACHE PATH "" ) + +set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-jptrwzs7vdbckndjg5qg4jwckfmgexmw/" CACHE PATH "" ) +set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-p2msdgsmomufcnwhnow5bbazg7463caf/" CACHE PATH "" ) + +# C++ options 
+set(CRAYPE_VERSION "2.7.17") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +if( ENABLE_HIP ) + set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation + + set( HIP_VERSION_STRING "5.2.0" CACHE STRING "" ) + set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" ) + + set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE ) + set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE ) + set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE ) + set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE ) +endif() diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake index a4c98307..b9d64b28 100644 --- a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake +++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake @@ -1,6 +1,6 @@ set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "") -include( crusher-base.cmake ) +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) # Set up the tpls set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") diff --git a/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake new file mode 100644 index 00000000..d25d6b2e --- /dev/null +++ b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake @@ -0,0 +1,30 @@ + +set(CONFIG_NAME "crusher-cpu-cce@14.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE CACHE BOOL "" ) +set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" ) + +set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" ) +set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" ) + +# C++ options +set(CRAYPE_VERSION "2.7.15") +set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "") +set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "") + +# HIP Options +set( ENABLE_HIP OFF CACHE BOOL "" FORCE ) diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index f78998a8..83e8c254 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -412,10 +412,9 @@ class ChaiBuffer m_capacity == 0 || chaiSpace == chai::NONE ) return; + auto & am = internal::getArrayManager(); const_cast< T * & >( m_pointer ) = - static_cast< T * >( internal::getArrayManager().move( const_cast< T_non_const * >( m_pointer ), - m_pointerRecord, - chaiSpace ) ); + static_cast< T * >( am.move( const_cast< T_non_const * >( m_pointer ), m_pointerRecord, chaiSpace 
) ); if( !std::is_const< T >::value && touch ) m_pointerRecord->m_touched[ chaiSpace ] = true; m_pointerRecord->m_last_space = chaiSpace; diff --git a/src/Macros.hpp b/src/Macros.hpp index 82cf24d1..bcd98b78 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -148,20 +148,20 @@ #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ - do \ - { \ - if( EXP ) \ - { \ - std::ostringstream __oss; \ - __oss << "***** ERROR\n"; \ - __oss << "***** LOCATION: " LOCATION "\n"; \ - __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ - __oss << MSG << "\n"; \ - __oss << LvArray::system::stackTrace( true ); \ - std::cout << __oss.str() << std::endl; \ - LvArray::system::callErrorHandler(); \ - } \ - } while( false ) + // do \ + // { \ + // if( EXP ) \ + // { \ + // std::ostringstream __oss; \ + // __oss << "***** ERROR\n"; \ + // __oss << "***** LOCATION: " LOCATION "\n"; \ + // __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ + // __oss << MSG << "\n"; \ + // __oss << LvArray::system::stackTrace( true ); \ + // std::cout << __oss.str() << std::endl; \ + // LvArray::system::callErrorHandler(); \ + // } \ + // } while( false ) #endif /** From 9cdbb7b16ee30bd6e86233c1da7f2796c4158afc Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Mon, 29 Aug 2022 15:55:13 -0700 Subject: [PATCH 28/34] Squash --- cmake/SetupTPL.cmake | 63 ++++- scripts/uberenv/packages/lvarray/package.py | 26 ++ .../toss_4_x86_64_ib/packages.yaml | 9 + src/CMakeLists.txt | 4 + src/dense/CMakeLists.cmake | 33 +++ src/dense/common.cpp | 18 ++ src/dense/common.hpp | 37 +++ src/dense/eigendecomposition.cpp | 229 ++++++++++++++++++ src/dense/eigendecomposition.hpp | 50 ++++ 9 files changed, 461 insertions(+), 8 deletions(-) create mode 100644 src/dense/CMakeLists.cmake create mode 100644 src/dense/common.cpp create mode 100644 src/dense/common.hpp create mode 100644 src/dense/eigendecomposition.cpp create mode 100644 src/dense/eigendecomposition.hpp diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index 29a60128..c312306b 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -56,6 +56,12 @@ endif() # CHAI ################################ if(ENABLE_CHAI) + if(NOT EXISTS ${CHAI_DIR}) + message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") + endif() + + message(STATUS "Using CHAI from ${CHAI_DIR}") + if(NOT ENABLE_UMPIRE) message(FATAL_ERROR "Umpire must be enabled to use CHAI.") endif() @@ -111,16 +117,57 @@ endif() ################################ # Python ################################ -if ( ENABLE_PYLVARRAY ) - message( STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}" ) - find_package( Python3 REQUIRED - COMPONENTS Development NumPy ) +if(ENABLE_PYLVARRAY) + message(STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}") + find_package(Python3 REQUIRED + COMPONENTS Development NumPy) + + message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}") + message(STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}") + message(STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}") + + set(thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy) +else() + message(STATUS "Not building pylvarray") +endif() + +################################ +# LAPACK/BLAS +################################ +if(ENABLE_LAPACK) + message(STATUS "BLAS_LIBRARIES = ${BLAS_LIBRARIES}") + message(STATUS "LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") - message( STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}" ) 
- message( STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}" ) - message( STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}" ) + blt_import_library(NAME blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${BLAS_LIBRARIES}) - set( thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy ) + blt_import_library(NAME lapack + DEPENDS_ON blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${LAPACK_LIBRARIES}) + + set(thirdPartyLibs ${thirdPartyLibs} blas lapack) +else() + message(STATUS "Not using LAPACK or BLAS.") +endif() + +################################ +# MAGMA +################################ +if(ENABLE_MAGMA) + message(STATUS "Using MAGMA from ${MAGMA_DIR}") + + if(NOT ENABLE_LAPACK) + message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") + endif() + + find_package(magma REQUIRED + PATHS ${MAGMA_DIR}) + + set(thirdPartyLibs ${thirdPartyLibs} magma) +else() + message(STATUS "Not using MAGMA.") endif() set( thirdPartyLibs ${thirdPartyLibs} CACHE STRING "" ) diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index b377bdfa..c5a3b35b 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -50,15 +50,21 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') + # variant('lapack', default=False, description='Build LAPACK and BLAS support') + # variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') + variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) + + # conflicts('~lapack', when='+magma') + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') @@ -76,6 +82,10 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') + # depends_on('blas', when='+lapack') + # depends_on('lapack', when='+lapack') + # depends_on('magma', when='+magma') + depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') @@ -313,6 +323,22 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + # cfg.write('#{0}\n'.format('-' * 80)) + # cfg.write('# Math libraries\n') + # cfg.write('#{0}\n\n'.format('-' * 80)) + # if '+lapack' in spec: + # cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + # cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + # cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + # else: + # cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + # if '+magma' in spec: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) + # else: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) + cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) diff --git 
a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
index e7ed36f4..ea2998fc 100644
--- a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
+++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml
@@ -2,6 +2,15 @@ packages:
   all:
     target: [default]
     compiler: [gcc, clang, intel]
+    providers:
+      blas: [intel-mkl]
+      lapack: [intel-mkl]
+
+  intel-mkl:
+    buildable: False
+    externals:
+    - spec: intel-mkl@2020.0.166 threads=openmp
+      prefix: /usr/tce/packages/mkl/mkl-2020.0/
 
   cmake:
     buildable: False
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index da7c512f..8d4ad2ca 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -79,3 +79,7 @@ lvarray_add_code_checks( PREFIX lvarray
 if( ENABLE_PYLVARRAY )
   add_subdirectory( python )
 endif()
+
+if( ENABLE_LAPACK )
+  add_subdirectory( dense )
+endif()
diff --git a/src/dense/CMakeLists.cmake b/src/dense/CMakeLists.cmake
new file mode 100644
index 00000000..0f7096ae
--- /dev/null
+++ b/src/dense/CMakeLists.cmake
@@ -0,0 +1,33 @@
+set( lvarraydense_headers
+     common.hpp
+     eigendecomposition.hpp
+     )
+
+set( lvarraydense_sources
+     common.cpp
+     eigendecomposition.cpp
+     )
+
+blt_add_library( NAME lvarraydense
+                 SOURCES ${lvarraydense_sources}
+                 HEADERS ${lvarraydense_headers}
+                 DEPENDS_ON lvarray ${lvarray_dependencies} blas lapack
+                 SHARED TRUE
+                 CLEAR_PREFIX TRUE
+                 )
+
+target_include_directories( lvarraydense
+                            PUBLIC
+                            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+                            $<INSTALL_INTERFACE:include> )
+
+install( TARGETS lvarraydense
+         EXPORT lvarraydense
+         ARCHIVE DESTINATION lib
+         LIBRARY DESTINATION lib
+         RUNTIME DESTINATION lib )
+
+install( EXPORT lvarraydense
+         DESTINATION share/lvarray/cmake/ )
+
+lvarray_add_code_checks( PREFIX lvarraydense )
diff --git a/src/dense/common.cpp b/src/dense/common.cpp
new file mode 100644
index 00000000..75c06070
--- /dev/null
+++ b/src/dense/common.cpp
@@ -0,0 +1,18 @@
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+char const * getOption( SymmetricMatrixStorageType const option )
+{
+  static constexpr char const * const upper = "U";
+  static constexpr char const * const lower = "L";
+
+  return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ?
upper : lower; +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp new file mode 100644 index 00000000..40072a91 --- /dev/null +++ b/src/dense/common.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include "common.hpp" +#include "../Array.hpp" +#include "../ChaiBuffer.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +enum class SymmetricMatrixStorageType +{ + UPPER_TRIANGULAR, + LOWER_TRIANGULAR, +}; + +/** + * TODO: move to internal namespace + */ +char const * getOption( SymmetricMatrixStorageType const option ); + +/** + * + */ +template< typename T > +struct Workspace +{ + Array< std::complex< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > workComplex; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rWork; +}; + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp new file mode 100644 index 00000000..fc51132a --- /dev/null +++ b/src/dense/eigendecomposition.cpp @@ -0,0 +1,229 @@ +#include "eigendecomposition.hpp" + +/// This macro provide a flexible interface for Fortran naming convention for compiled objects +// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE +#define FORTRAN_MANGLE( name ) name +// #else +// #define FORTRAN_MANGLE( name ) name ## _ +// #endif + +extern "C" +{ + +#define LVARRAY_CHEEV FORTRAN_MANGLE( cheev ) +void LVARRAY_CHEEV( + char const * JOBZ, + char const * UPLO, + int const * N, + std::complex< float > * A, + int const * LDA, + float * W, + std::complex< float > * WORK, + int const * LWORK, + float const * RWORK, + int * INFO +); + +#define LVARRAY_ZHEEV FORTRAN_MANGLE( zheev ) +void LVARRAY_ZHEEV( + char const * JOBZ, + char const * UPLO, + int const * N, + std::complex< double > * A, + int const * LDA, + double * W, + std::complex< double > * WORK, + int const * LWORK, + double const * RWORK, + int * INFO ); + +#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) +void LVARRAY_ZHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + int const * N, + std::complex< double > * A, + int const * LDA, + double const * VL, + double const * VU, + int const * IL, + int const * IU, + double const * ABSTOL, + int * M, + double * W, + double * Z, + int const * LDZ, + int * ISUPPZ, + std::complex< double > * WORK, + int const * LWORK, + double * RWORK, + int * LRWORK, + int const * IWORK, + int const * LIWORK, + int * INFO ); + + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ +namespace internal +{ + +/** + * + */ +char const * getOption( EigenDecompositionOption const option ) +{ + static constexpr char const * const eigenvalueString = "N"; + static constexpr char const * const eigenvectorString = "V"; + + return option == EigenDecompositionOption::EIGENVALUES ? 
eigenvalueString : eigenvectorString; +} + +struct HEEVR_status +{ + int LWORK; + int LRWORK; + int LIWORK; + bool success +}; + + +template< typename T, typename INDEX_TYPE > +HEEVR_Sizes heevr( + EigenDecompositionOption const decompositionOptions, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template< typename T, typename INDEX_TYPE > +void heev( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::cpu, "Device not yet supported." ); + + LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), A.size( 1 ), + "The matrix A must be square." ); + + LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), eigenValues.size(), + "The matrix A and lambda have incompatible sizes." ); + + // define the arguments of zheev + int const N = LvArray::integerConversion< int >( A.size( 0 ) ); + int const LDA = N; + int INFO; + + // Make sure that the workspace is the right size. + workspace.rWork.resizeWithoutInitializationOrDestruction( std::max( 1, 3 * N - 2 ) ); + + if( workspace.workComplex.size() < std::max( 1, 2 * N - 1 ) ); + { + std::complex< T > optimalWorkSize{ 0, 0 }; + + int LWORK = -1; + + if( std::is_same_v< T, float > ) + { + LVARRAY_CHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + nullptr, + &LDA, + nullptr, + &optimalWorkSize, + &LWORK, + nullptr, + &INFO ); + } + else + { + LVARRAY_ZHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + nullptr, + &LDA, + nullptr, + &optimalWorkSize, + &LWORK, + nullptr, + &INFO ); + } + + LVARRAY_ERROR_IF_NE_MSG( INFO, 0, + "Error in computing the optimal workspace size." ); + + workspace.workComplex.resizeWithoutInitializationOrDestruction( + static_cast< INDEX_TYPE >( optimalWorkSize.real() ) ); + } + + int const LWORK = integerConversion< int >( workspace.workComplex.size() ); + + if( std::is_same< T, float >::value ) + { + LVARRAY_CHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + A.data(), + &LDA, + eigenValues.data(), + workspace.workComplex.data(), + &LWORK, + workspace.rWork.data(), + &INFO ); + } + else + { + LVARRAY_ZHEEV( + getOption( decompositionType ), + getOption( storageType ), + &N, + A.data(), + &LDA, + eigenValues.data(), + workspace.workComplex.data(), + &LWORK, + workspace.rWork.data(), + &INFO ); + } + + LVARRAY_ERROR_IF_NE_MSG( INFO, 0, + "Error in computing the eigen decomposition." ); +}` + + +// explicit instantiations. 
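+// Only the float and double specializations are provided, matching the cheev and
+// zheev bindings declared above. The index type is assumed to be std::ptrdiff_t,
+// the default used by the Workspace arrays.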
+template void heev< float >( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< float >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< float, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< float > & workspace, + SymmetricMatrixStorageType const storageType ); + +template void heev< double >( + MemorySpace const space, + EigenDecompositionOption const decompositionType, + ArraySlice< std::complex< double >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< double, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< double > & workspace, + SymmetricMatrixStorageType const storageType ); + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp new file mode 100644 index 00000000..c7d98e49 --- /dev/null +++ b/src/dense/eigendecomposition.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +struct EigenDecompositionOptions +{ + enum Type + { + EIGENVALUES, + EIGENVALUES_AND_VECTORS, + }; + + enum Range + { + ALL, + IN_RANGE, + BY_INDEX, + }; + + Type const m_type; + Range const m_range; + double const rangeMin; + double const rangeMax; + int const indexMin; + int const indexMax; +}; + +/** + * + */ +template< typename T, INDEX_TYPE > +void heev( + MemorySpace const space, + EigenDecompositionOption const options, + ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, + ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, + Workspace< T > & workspace, + SymmetricMatrixStorageType const storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR +); + +} // namespace dense +} // namespace LvArray \ No newline at end of file From 9fb36e9b256fed0a2bc4a96b1adc1b217829ddfa Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Wed, 31 Aug 2022 16:47:57 -0700 Subject: [PATCH 29/34] Squash --- src/Macros.hpp | 28 +- .../{CMakeLists.cmake => CMakeLists.txt} | 0 src/dense/common.hpp | 101 ++++++- src/dense/eigendecomposition.cpp | 258 +++++++++--------- src/dense/eigendecomposition.hpp | 40 +-- 5 files changed, 271 insertions(+), 156 deletions(-) rename src/dense/{CMakeLists.cmake => CMakeLists.txt} (100%) diff --git a/src/Macros.hpp b/src/Macros.hpp index bcd98b78..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -148,20 +148,20 @@ #endif #else #define LVARRAY_ERROR_IF( EXP, MSG ) \ - // do \ - // { \ - // if( EXP ) \ - // { \ - // std::ostringstream __oss; \ - // __oss << "***** ERROR\n"; \ - // __oss << "***** LOCATION: " LOCATION "\n"; \ - // __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ - // __oss << MSG << "\n"; \ - // __oss << LvArray::system::stackTrace( true ); \ - // std::cout << __oss.str() << std::endl; \ - // LvArray::system::callErrorHandler(); \ - // } \ - // } while( false ) + do \ + { \ + if( EXP ) \ + { \ + std::ostringstream __oss; \ + __oss << "***** ERROR\n"; \ + __oss << "***** LOCATION: " LOCATION "\n"; \ + __oss << "***** Controlling expression (should be false): " STRINGIZE( EXP ) "\n"; \ + __oss << MSG << "\n"; \ + __oss << LvArray::system::stackTrace( true ); \ + std::cout << __oss.str() << std::endl; \ + LvArray::system::callErrorHandler(); \ + } \ + } while( false ) #endif /** diff --git a/src/dense/CMakeLists.cmake b/src/dense/CMakeLists.txt similarity index 100% rename from src/dense/CMakeLists.cmake rename to src/dense/CMakeLists.txt diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 40072a91..4588080e 
100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -1,13 +1,36 @@ #pragma once -#include "common.hpp" #include "../Array.hpp" #include "../ChaiBuffer.hpp" +#include + namespace LvArray { namespace dense { +namespace internal +{ + +/** + * TODO make a complex type and add it to the main LvArray. Make a uniform way of interacting with various complex number implementations. + */ +template< typename T > +struct RealVersion +{ + using Type = T; +}; + +/** + * + */ +template< typename T > +struct RealVersion< std::complex< T > > +{ + using Type = T; +}; + +} // namespace internal /** * @@ -27,10 +50,82 @@ char const * getOption( SymmetricMatrixStorageType const option ); * */ template< typename T > +using RealVersion = typename internal::RealVersion< T >::Type; + + +using DenseInt = int; + +/** + * + */ +template< typename T > +struct Matrix +{ + /** + * + */ + template< typename INDEX_TYPE > + Matrix( ArraySlice< T, 2, 1, INDEX_TYPE > const & slice ): + nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, + nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, + data{ &slice( 0, 0 ) } + {} + + /** + * + */ + bool isSquare() const + { + return nRows == nCols; + } + + DenseInt const nRows; + DenseInt const nCols; + DenseInt const stride; + T * const data; +}; + +/** + * + */ +template< typename T > +struct Vector +{ + template< int USD, typename INDEX_TYPE > + Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + n{ integerConversion< DenseInt >( slice.size() ) }, + stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, + data{ &slice[ 0 ] } + {} + + DenseInt const n; + DenseInt const stride; + T * const data; +}; + +/** + * TODO(corbett5): Make this into a virtual heirarchy so we can get rid of ChaiBuffer here. + * Also add a version that is only for computing sizes so no dynamic allocation needed. + * When that is done you can get rid of the constructor here. 
+ */ +template< typename T > struct Workspace { - Array< std::complex< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > workComplex; - Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rWork; + Workspace() + {} + + Workspace( std::ptrdiff_t initialSize ): + work( initialSize ), + rwork( initialSize ), + iwork( initialSize ) + {} + + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > work; + + Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rwork; + + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > iwork; }; } // namespace dense diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp index fc51132a..5bf2e0c4 100644 --- a/src/dense/eigendecomposition.cpp +++ b/src/dense/eigendecomposition.cpp @@ -52,14 +52,14 @@ void LVARRAY_ZHEEVR( double const * ABSTOL, int * M, double * W, - double * Z, + std::complex< double > * Z, int const * LDZ, int * ISUPPZ, std::complex< double > * WORK, int const * LWORK, double * RWORK, - int * LRWORK, - int const * IWORK, + int const * LRWORK, + int * IWORK, int const * LIWORK, int * INFO ); @@ -73,157 +73,169 @@ namespace dense namespace internal { -/** - * - */ -char const * getOption( EigenDecompositionOption const option ) +template< typename T > +int heevr( + MemorySpace const space, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenValues, + Matrix< std::complex< T > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) { - static constexpr char const * const eigenvalueString = "N"; - static constexpr char const * const eigenvectorString = "V"; + LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); - return option == EigenDecompositionOption::EIGENVALUES ? eigenvalueString : eigenvectorString; -} + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); -struct HEEVR_status -{ - int LWORK; - int LRWORK; - int LIWORK; - bool success -}; - - -template< typename T, typename INDEX_TYPE > -HEEVR_Sizes heevr( - EigenDecompositionOption const decompositionOptions, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType, - bool const compute ) + char const * const JOBZ = decompositionOptions.typeArg(); + char const * const RANGE = decompositionOptions.rangeArg(); + char const * const UPLO = getOption( storageType ); + int const N = integerConversion< int >( A.nCols ); + int const LDA = A.stride; -} // namespace internal + T const VL = decompositionOptions.rangeMin; + T const VU = decompositionOptions.rangeMax; -//////////////////////////////////////////////////////////////////////////////////////////////////// -template< typename T, typename INDEX_TYPE > -void heev( - MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType ) -{ - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::cpu, "Device not yet supported." ); + if( decompositionOptions.range == EigenDecompositionOptions::Range::IN_INTERVAL ) + { + LVARRAY_ERROR_IF_GE( VL, VU ); + } - LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), A.size( 1 ), - "The matrix A must be square." 
); + int maxEigenvaluesToFind = N; + int const IL = decompositionOptions.indexMin; + int const IU = decompositionOptions.indexMax; + if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) + { + LVARRAY_ERROR_IF_LT( IL, 1 ); + LVARRAY_ERROR_IF_GT( IU, N ); + LVARRAY_ERROR_IF_GT( IL, IU ); - LVARRAY_ASSERT_EQ_MSG( A.size( 0 ), eigenValues.size(), - "The matrix A and lambda have incompatible sizes." ); + maxEigenvaluesToFind = IU - IL + 1; + } - // define the arguments of zheev - int const N = LvArray::integerConversion< int >( A.size( 0 ) ); - int const LDA = N; - int INFO; + LVARRAY_ERROR_IF_LT( eigenValues.n, maxEigenvaluesToFind ); - // Make sure that the workspace is the right size. - workspace.rWork.resizeWithoutInitializationOrDestruction( std::max( 1, 3 * N - 2 ) ); + int const ABSTOL = decompositionOptions.abstol; + int M = 0; - if( workspace.workComplex.size() < std::max( 1, 2 * N - 1 ) ); + if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) { - std::complex< T > optimalWorkSize{ 0, 0 }; - - int LWORK = -1; + LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); + LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); + } - if( std::is_same_v< T, float > ) - { - LVARRAY_CHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - nullptr, - &LDA, - nullptr, - &optimalWorkSize, - &LWORK, - nullptr, - &INFO ); - } - else - { - LVARRAY_ZHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - nullptr, - &LDA, - nullptr, - &optimalWorkSize, - &LWORK, - nullptr, - &INFO ); - } + int const LDZ = eigenVectors.stride; - LVARRAY_ERROR_IF_NE_MSG( INFO, 0, - "Error in computing the optimal workspace size." ); - - workspace.workComplex.resizeWithoutInitializationOrDestruction( - static_cast< INDEX_TYPE >( optimalWorkSize.real() ) ); - } + // TODO: check ISUPPZ + + int const LWORK = compute ? integerConversion< int >( workspace.work.size() ) : -1; + int const LRWORK = integerConversion< int >( workspace.rwork.size() ); + int const LIWORK = integerConversion< int >( workspace.iwork.size() ); - int const LWORK = integerConversion< int >( workspace.workComplex.size() ); + int INFO = 0; + // With C++ 17 we can remove the reinterpret_cast with constexpr if. if( std::is_same< T, float >::value ) { - LVARRAY_CHEEV( - getOption( decompositionType ), - getOption( storageType ), - &N, - A.data(), - &LDA, - eigenValues.data(), - workspace.workComplex.data(), - &LWORK, - workspace.rWork.data(), - &INFO ); } else { - LVARRAY_ZHEEV( - getOption( decompositionType ), - getOption( storageType ), + LVARRAY_ZHEEVR( + JOBZ, + RANGE, + UPLO, &N, - A.data(), + reinterpret_cast< std::complex< double > * >( A.data ), &LDA, - eigenValues.data(), - workspace.workComplex.data(), + reinterpret_cast< double const * >( &VL ), + reinterpret_cast< double const * >( &VU ), + &IL, + &IU, + reinterpret_cast< double const * >( &ABSTOL ), + &M, + reinterpret_cast< double * >( eigenValues.data ), + reinterpret_cast< std::complex< double > * >( eigenVectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< double > * >( workspace.work.data() ), &LWORK, - workspace.rWork.data(), + reinterpret_cast< double * >( workspace.rwork.data() ), + &LRWORK, + workspace.iwork.data(), + &LIWORK, &INFO ); } - - LVARRAY_ERROR_IF_NE_MSG( INFO, 0, - "Error in computing the eigen decomposition." ); -}` + LVARRAY_ERROR_IF_NE( INFO, 0 ); -// explicit instantiations. 
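+// Only heevr< float > is instantiated below; note that its branch above is still
+// empty, so only the zheevr (double) code path actually calls into LAPACK here.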
-template void heev< float >( + return M; +} + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< typename T > +int heevr( MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< float >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< float, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< float > & workspace, - SymmetricMatrixStorageType const storageType ); + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenValues, + Matrix< std::complex< T > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + bool const reallocateWork = workspace.work.size() < 2 * A.nRows; + bool const reallocateRWork = workspace.rwork.size() < 24 * A.nRows; + bool const reallocateIWork = workspace.iwork.size() < 10 * A.nRows; -template void heev< double >( + if( reallocateWork || reallocateRWork || reallocateIWork ) + { + Workspace< std::complex< T > > optimalSizes( 1 ); + internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); + + if( reallocateWork ) + { + workspace.work.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.work[ 0 ].real() ) ); + } + + if( reallocateRWork ) + { + workspace.rwork.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.rwork[ 0 ] ) ); + } + + if( reallocateIWork ) + { + workspace.rwork.resizeWithoutInitializationOrDestruction( space, optimalSizes.iwork[ 0 ] ); + } + } + + return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); +} + +// explicit instantiations. +template int heevr< float >( MemorySpace const space, - EigenDecompositionOption const decompositionType, - ArraySlice< std::complex< double >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< double, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< double > & workspace, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< float > > const & A, + Vector< float > const & eigenValues, + Matrix< std::complex< float > > const & eigenVectors, + Vector< int > const & support, + Workspace< std::complex< float > > & workspace, SymmetricMatrixStorageType const storageType ); +// template int heevr< double >( +// MemorySpace const space, +// EigenDecompositionOptions const decompositionOptions, +// Matrix< std::complex< double > > const & A, +// Vector< double > const & eigenValues, +// Matrix< std::complex< double > > const & eigenVectors, +// Vector< int > const & support, +// Workspace< std::complex< double > > & workspace, +// SymmetricMatrixStorageType const storageType ); + } // namespace dense } // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp index c7d98e49..75ae830b 100644 --- a/src/dense/eigendecomposition.hpp +++ b/src/dense/eigendecomposition.hpp @@ -21,30 +21,38 @@ struct EigenDecompositionOptions enum Range { ALL, - IN_RANGE, + IN_INTERVAL, BY_INDEX, }; - Type const m_type; - Range const m_range; + char const * typeArg() const + { + static constexpr char const * const eigenvalueString = "N"; + static constexpr char const * const eigenvectorString = "V"; + + return type == Type::EIGENVALUES ? 
eigenvalueString : eigenvectorString; + } + + char const * rangeArg() const + { + static constexpr char const * const allString = "A"; + static constexpr char const * const intervalString = "V"; + static constexpr char const * const indexString = "I"; + + if( range == Range::ALL ) + { return allString; } + + return range == Range::IN_INTERVAL ? intervalString : indexString; + } + + Type const type; + Range const range; double const rangeMin; double const rangeMax; int const indexMin; int const indexMax; + double const abstol; }; -/** - * - */ -template< typename T, INDEX_TYPE > -void heev( - MemorySpace const space, - EigenDecompositionOption const options, - ArraySlice< std::complex< T >, 2, 1, INDEX_TYPE > const & A, - ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - Workspace< T > & workspace, - SymmetricMatrixStorageType const storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR -); - } // namespace dense } // namespace LvArray \ No newline at end of file From c7036cbe276cb671f9758bccdad27dc9e487d198 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 1 Sep 2022 01:36:40 -0700 Subject: [PATCH 30/34] Eigen stuff seems to be at least partialy working. --- scripts/uberenv/packages/lvarray/package.py | 307 ++++++++++++-------- src/ArraySlice.hpp | 9 + src/dense/CMakeLists.txt | 4 +- src/dense/common.hpp | 130 +++++++-- src/dense/eigenDecomposition.cpp | 270 +++++++++++++++++ src/dense/eigenDecomposition.hpp | 202 +++++++++++++ src/dense/eigendecomposition.cpp | 241 --------------- src/dense/eigendecomposition.hpp | 58 ---- unitTests/CMakeLists.txt | 5 + unitTests/dense/CMakeLists.txt | 34 +++ unitTests/dense/testEigenDecomposition.cpp | 62 ++++ 11 files changed, 881 insertions(+), 441 deletions(-) create mode 100644 src/dense/eigenDecomposition.cpp create mode 100644 src/dense/eigenDecomposition.hpp delete mode 100644 src/dense/eigendecomposition.cpp delete mode 100644 src/dense/eigendecomposition.hpp create mode 100644 unitTests/dense/CMakeLists.txt create mode 100644 unitTests/dense/testEigenDecomposition.cpp diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index c5a3b35b..347ca123 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -18,6 +18,12 @@ def cmake_cache_entry(name, value, comment=""): return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name, value, comment) +def cmake_cache_list(name, value, comment=""): + """Generate a list for a cmake cache variable""" + + indent = 5 + len(name) + join_str = '\n' + ' ' * indent + return 'set(%s %s CACHE STRING "%s")\n\n' % (name, join_str.join(value), comment) def cmake_cache_string(name, string, comment=""): """Generate a string for a cmake cache variable""" @@ -50,7 +56,7 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') - # variant('lapack', default=False, description='Build LAPACK and BLAS support') + variant('lapack', default=False, description='Build LAPACK and BLAS support') # variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') @@ -65,7 +71,7 @@ class Lvarray(CMakePackage, CudaPackage): # conflicts('~lapack', when='+magma') - depends_on('blt', when='@0.2.0:', 
type='build') + depends_on('blt@0.4.1:', when='@0.2.0:', type='build') depends_on('camp') @@ -82,8 +88,8 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') - # depends_on('blas', when='+lapack') - # depends_on('lapack', when='+lapack') + depends_on('blas', when='+lapack') + depends_on('lapack', when='+lapack') # depends_on('magma', when='+magma') depends_on('doxygen@1.8.13:', when='+docs', type='build') @@ -181,132 +187,130 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cmake_exe = os.path.realpath(cmake_exe) host_config_path = self._get_host_config_path(spec) - cfg = open(host_config_path, "w") - cfg.write("#{0}\n".format("#" * 80)) - cfg.write("# Generated host-config - Edit at own risk!\n") - cfg.write("#{0}\n".format("#" * 80)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) - cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) - cfg.write("# CMake executable path: %s\n" % cmake_exe) - cfg.write("#{0}\n\n".format("-" * 80)) - - if 'blt' in spec: - cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - - ####################### - # Compiler Settings - ####################### - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Compilers\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) - cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) - - # use global spack compiler flags - cflags = ' '.join(spec.compiler_flags['cflags']) - cxxflags = ' '.join(spec.compiler_flags['cxxflags']) - - if "%intel" in spec: - cflags += ' -qoverride-limits' - cxxflags += ' -qoverride-limits' - - if cflags: - cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) - - if cxxflags: - cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + with open(host_config_path, "w") as cfg: + cfg.write("#{0}\n".format("#" * 80)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("#{0}\n".format("#" * 80)) - release_flags = "-O3 -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", - release_flags)) - reldebinf_flags = "-O3 -g -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", - reldebinf_flags)) - debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) - - if "%clang arch=linux-rhel7-ppc64le" in spec: - cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) - - if "+cuda" in spec: cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Cuda\n") + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# Spec: {0}\n".format(spec)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CUDA", True)) - cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) - - cudatoolkitdir = spec['cuda'].prefix - cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", - cudatoolkitdir)) - cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" - cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) - - cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' - 'cross-execution-space-call,reorder,' - 'deprecated-declarations') + if 'blt' in spec: + cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") - for archSpecifier in archSpecifiers: - for compilerArg 
in spec.compiler_flags['cxxflags']: - if compilerArg.startswith(archSpecifier): - cmake_cuda_flags += ' -Xcompiler ' + compilerArg + ####################### + # Compiler Settings + ####################### - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", - "-O3 -Xcompiler -O3 -DNDEBUG")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", - "-O3 -g -lineinfo -Xcompiler -O3")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", - "-O0 -Xcompiler -O0 -g -G")) - - else: - cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CAMP\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Compilers\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + + if "%intel" in spec: + cflags += ' -qoverride-limits' + cxxflags += ' -qoverride-limits' + + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + release_flags = "-O3 -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", + release_flags)) + reldebinf_flags = "-O3 -g -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", + reldebinf_flags)) + debug_flags = "-O0 -g" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) + + if "%clang arch=linux-rhel7-ppc64le" in spec: + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) + + if "+cuda" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Cuda\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + + cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' + 'cross-execution-space-call,reorder,' + 'deprecated-declarations') + + archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") + for archSpecifier in archSpecifiers: + for compilerArg in spec.compiler_flags['cxxflags']: + if compilerArg.startswith(archSpecifier): + cmake_cuda_flags += ' -Xcompiler ' + compilerArg + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", + "-O3 -Xcompiler -O3 -DNDEBUG")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", + "-O3 -g -lineinfo -Xcompiler -O3")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", + "-O0 -Xcompiler -O0 -g -G")) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CAMP\n") + 
cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# RAJA\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) - cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# RAJA\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Umpire\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) - if "+umpire" in spec: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) - cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Umpire\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CHAI\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+umpire" in spec: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) + cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) - if "+chai" in spec: - cfg.write(cmake_cache_option("ENABLE_CHAI", True)) - cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CHAI", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CHAI\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+chai" in spec: + cfg.write(cmake_cache_option("ENABLE_CHAI", True)) + cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CHAI", False)) +<<<<<<< HEAD if "+caliper" in spec: cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) @@ -358,15 +362,74 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): 'doxygen'))) else: cfg.write(cmake_cache_option("ENABLE_DOCS", False)) +======= + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# addr2line\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + if "+caliper" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) + cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Python\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) + cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) + else: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Math libraries\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+lapack' in spec: + cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + else: + cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + # if '+magma' in spec: + # 
cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) + # else: + # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Other\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Documentation\n") + cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: + cfg.write(cmake_cache_option("ENABLE_DOCS", True)) + sphinx_dir = spec['py-sphinx'].prefix + cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', + os.path.join(sphinx_dir, + 'bin', + 'sphinx-build'))) + + doxygen_dir = spec['doxygen'].prefix + cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', + os.path.join(doxygen_dir, + 'bin', + 'doxygen'))) + else: + cfg.write(cmake_cache_option("ENABLE_DOCS", False)) +>>>>>>> f6cec78 (Eigen stuff seems to be at least partialy working.) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# addr2line\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Other\n") + cfg.write("#{0}\n\n".format("-" * 80)) def cmake_args(self): spec = self.spec diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp index 374979e3..84357d8b 100644 --- a/src/ArraySlice.hpp +++ b/src/ArraySlice.hpp @@ -308,6 +308,15 @@ class ArraySlice return m_data[ linearIndex( indices ... ) ]; } + /** + * @brief + */ + LVARRAY_HOST_DEVICE inline constexpr + T * data() const + { + return m_data; + } + /** * @return Return a pointer to the values. * @tparam USD_ Dummy template parameter, do not specify. diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 0f7096ae..36778a28 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -1,11 +1,11 @@ set( lvarraydense_headers common.hpp - eigendecomposition.hpp + eigenDecomposition.hpp ) set( lvarraydense_sources common.cpp - eigendecomposition.cpp + eigenDecomposition.cpp ) blt_add_library( NAME lvarraydense diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 4588080e..146bb407 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -65,11 +65,21 @@ struct Matrix * */ template< typename INDEX_TYPE > - Matrix( ArraySlice< T, 2, 1, INDEX_TYPE > const & slice ): + Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, - stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, - data{ &slice( 0, 0 ) } + stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, + columnMajor{ true }, + data{ slice.data() } + {} + + template< typename INDEX_TYPE > + Matrix( T & value ): + nRows{ 1 }, + nCols{ 1 }, + stride{ 1 }, + columnMajor{ true }, + data{ &value } {} /** @@ -83,6 +93,7 @@ struct Matrix DenseInt const nRows; DenseInt const nCols; DenseInt const stride; + bool const columnMajor; T * const data; }; @@ -94,38 +105,121 @@ struct Vector { template< int USD, typename INDEX_TYPE > Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - n{ integerConversion< DenseInt >( slice.size() ) }, - stride{ integerConversion< DenseInt >( slice.strides()[ 0 ] ) }, - data{ &slice[ 0 ] } + size{ integerConversion< DenseInt >( slice.size() ) }, + stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + data{ slice.data() } {} - DenseInt const n; + Vector( T & value ): + size{ 1 }, + stride{ 1 }, + data{ &value } + {} + + DenseInt const 
size; DenseInt const stride; T * const data; }; /** - * TODO(corbett5): Make this into a virtual heirarchy so we can get rid of ChaiBuffer here. - * Also add a version that is only for computing sizes so no dynamic allocation needed. - * When that is done you can get rid of the constructor here. + * */ template< typename T > struct Workspace { - Workspace() + virtual ~Workspace() + {}; + + virtual Vector< T > work() = 0; + + virtual Vector< RealVersion< T > > rwork() = 0; + + virtual Vector< DenseInt > iwork() = 0; + + virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; +}; + +/** + * + */ +template< typename T, template< typename > class BUFFER_TYPE > +struct ArrayWorkspace : public Workspace< T > +{ + ArrayWorkspace() {} - Workspace( std::ptrdiff_t initialSize ): - work( initialSize ), - rwork( initialSize ), - iwork( initialSize ) + virtual Vector< T > work() override + { return m_work.toSlice(); } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork.toSlice(); } + + virtual Vector< DenseInt > iwork() override + { return m_iwork.toSlice(); } + + virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override + { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override + { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override + { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + +private: + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + + Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; + + Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; +}; + +/** + * + */ +template< typename T > +struct OptimalSizeCalculation : public Workspace< T > +{ + OptimalSizeCalculation() {} - Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > work; + virtual Vector< T > work() override + { return m_work; } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork; } + + virtual Vector< int > iwork() override + { return m_iwork; } + + virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." 
); }
+
+  DenseInt optimalWorkSize() const
+  { return static_cast< DenseInt >( m_work.real() ); }
+
+  DenseInt optimalRWorkSize() const
+  { return static_cast< DenseInt >( m_rwork ); }
+
+  DenseInt optimalIWorkSize() const
+  { return m_iwork; }
+
+private:
+  T m_work;
 
-  Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > rwork;
+  RealVersion< T > m_rwork;
 
-  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, ChaiBuffer > iwork;
+  DenseInt m_iwork;
 };
 
 } // namespace dense
diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp
new file mode 100644
index 00000000..68a2256d
--- /dev/null
+++ b/src/dense/eigenDecomposition.cpp
@@ -0,0 +1,270 @@
+#include "eigenDecomposition.hpp"
+
+/// This macro provides a flexible interface to the Fortran name-mangling convention for compiled objects.
+// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
+#define FORTRAN_MANGLE( name ) name
+// #else
+// #define FORTRAN_MANGLE( name ) name ## _
+// #endif
+
+extern "C"
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define LVARRAY_CHEEVR FORTRAN_MANGLE( cheevr )
+void LVARRAY_CHEEVR(
+  char const * JOBZ,
+  char const * RANGE,
+  char const * UPLO,
+  LvArray::dense::DenseInt const * N,
+  std::complex< float > * A,
+  LvArray::dense::DenseInt const * LDA,
+  float const * VL,
+  float const * VU,
+  LvArray::dense::DenseInt const * IL,
+  LvArray::dense::DenseInt const * IU,
+  float const * ABSTOL,
+  LvArray::dense::DenseInt * M,
+  float * W,
+  std::complex< float > * Z,
+  LvArray::dense::DenseInt const * LDZ,
+  LvArray::dense::DenseInt * ISUPPZ,
+  std::complex< float > * WORK,
+  LvArray::dense::DenseInt const * LWORK,
+  float * RWORK,
+  LvArray::dense::DenseInt const * LRWORK,
+  LvArray::dense::DenseInt * IWORK,
+  LvArray::dense::DenseInt const * LIWORK,
+  LvArray::dense::DenseInt * INFO );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr )
+void LVARRAY_ZHEEVR(
+  char const * JOBZ,
+  char const * RANGE,
+  char const * UPLO,
+  LvArray::dense::DenseInt const * N,
+  std::complex< double > * A,
+  LvArray::dense::DenseInt const * LDA,
+  double const * VL,
+  double const * VU,
+  LvArray::dense::DenseInt const * IL,
+  LvArray::dense::DenseInt const * IU,
+  double const * ABSTOL,
+  LvArray::dense::DenseInt * M,
+  double * W,
+  std::complex< double > * Z,
+  LvArray::dense::DenseInt const * LDZ,
+  LvArray::dense::DenseInt * ISUPPZ,
+  std::complex< double > * WORK,
+  LvArray::dense::DenseInt const * LWORK,
+  double * RWORK,
+  LvArray::dense::DenseInt const * LRWORK,
+  LvArray::dense::DenseInt * IWORK,
+  LvArray::dense::DenseInt const * LIWORK,
+  LvArray::dense::DenseInt * INFO );
+
+
+} // extern "C"
+
+namespace LvArray
+{
+namespace dense
+{
+namespace internal
+{
+
+/**
+ * Dispatches to the single or double precision LAPACK routine (cheevr or zheevr).
+ * When @p compute is false the workspace sizes are passed to LAPACK as -1, which
+ * turns the call into a workspace-size query.
+ */
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType,
+  bool const compute )
+{
+  LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." );
+
+  LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square."
);
+
+  char const * const JOBZ = decompositionOptions.typeArg();
+  char const * const RANGE = decompositionOptions.rangeArg();
+  char const * const UPLO = getOption( storageType );
+  DenseInt const N = A.nCols;
+  DenseInt const LDA = A.stride;
+
+  T const VL = decompositionOptions.rangeMin;
+  T const VU = decompositionOptions.rangeMax;
+
+  DenseInt maxEigenvaluesToFind = N;
+  DenseInt const IL = decompositionOptions.indexMin;
+  DenseInt const IU = decompositionOptions.indexMax;
+  if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX )
+  {
+    LVARRAY_ERROR_IF_GT( IU, N );
+    maxEigenvaluesToFind = IU - IL + 1;
+  }
+
+  LVARRAY_ERROR_IF_LT( eigenValues.size, maxEigenvaluesToFind );
+
+  T const ABSTOL = decompositionOptions.abstol;
+  DenseInt M = 0;
+
+  if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS )
+  {
+    LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N );
+    LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind );
+  }
+
+  DenseInt const LDZ = std::max( DenseInt( 1 ), eigenVectors.stride );
+
+  if( decompositionOptions.range == EigenDecompositionOptions::Range::ALL ||
+      ( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX &&
+        maxEigenvaluesToFind == N ) )
+  {
+    LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind );
+  }
+
+  DenseInt const LWORK = compute ? workspace.work().size : -1;
+  DenseInt const LRWORK = compute ? workspace.rwork().size : -1;
+  DenseInt const LIWORK = compute ? workspace.iwork().size : -1;
+
+  DenseInt INFO = 0;
+
+  // With C++17 the reinterpret_casts could be replaced by an if constexpr dispatch.
+  if( std::is_same< T, float >::value )
+  {
+    LVARRAY_CHEEVR(
+      JOBZ,
+      RANGE,
+      UPLO,
+      &N,
+      reinterpret_cast< std::complex< float > * >( A.data ),
+      &LDA,
+      reinterpret_cast< float const * >( &VL ),
+      reinterpret_cast< float const * >( &VU ),
+      &IL,
+      &IU,
+      reinterpret_cast< float const * >( &ABSTOL ),
+      &M,
+      reinterpret_cast< float * >( eigenValues.data ),
+      reinterpret_cast< std::complex< float > * >( eigenVectors.data ),
+      &LDZ,
+      support.data,
+      reinterpret_cast< std::complex< float > * >( workspace.work().data ),
+      &LWORK,
+      reinterpret_cast< float * >( workspace.rwork().data ),
+      &LRWORK,
+      workspace.iwork().data,
+      &LIWORK,
+      &INFO );
+  }
+  else
+  {
+    LVARRAY_ZHEEVR(
+      JOBZ,
+      RANGE,
+      UPLO,
+      &N,
+      reinterpret_cast< std::complex< double > * >( A.data ),
+      &LDA,
+      reinterpret_cast< double const * >( &VL ),
+      reinterpret_cast< double const * >( &VU ),
+      &IL,
+      &IU,
+      reinterpret_cast< double const * >( &ABSTOL ),
+      &M,
+      reinterpret_cast< double * >( eigenValues.data ),
+      reinterpret_cast< std::complex< double > * >( eigenVectors.data ),
+      &LDZ,
+      support.data,
+      reinterpret_cast< std::complex< double > * >( workspace.work().data ),
+      &LWORK,
+      reinterpret_cast< double * >( workspace.rwork().data ),
+      &LRWORK,
+      workspace.iwork().data,
+      &LIWORK,
+      &INFO );
+  }
+
+  LVARRAY_ERROR_IF_NE( INFO, 0 );
+
+  return M;
+}
+
+} // namespace internal
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  // Minimum sizes documented for zheevr: LWORK >= 2N, LRWORK >= 24N, LIWORK >= 10N.
+  bool const reallocateWork = workspace.work().size < 2 * A.nRows;
+
bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows;
+  bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows;
+
+  if( reallocateWork || reallocateRWork || reallocateIWork )
+  {
+    // Query the optimal sizes by running the routine with workspace sizes of -1;
+    // OptimalSizeCalculation captures the optima without allocating anything.
+    OptimalSizeCalculation< std::complex< T > > optimalSizes;
+    internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false );
+
+    if( reallocateWork )
+    {
+      workspace.resizeWork( space, optimalSizes.optimalWorkSize() );
+    }
+
+    if( reallocateRWork )
+    {
+      workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() );
+    }
+
+    if( reallocateIWork )
+    {
+      workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() );
+    }
+  }
+
+  return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true );
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// explicit instantiations.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template DenseInt heevr< float >(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< float > > const & A,
+  Vector< float > const & eigenValues,
+  Matrix< std::complex< float > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< float > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template DenseInt heevr< double >(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< double > > const & A,
+  Vector< double > const & eigenValues,
+  Matrix< std::complex< double > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< double > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp
new file mode 100644
index 00000000..16ec001a
--- /dev/null
+++ b/src/dense/eigenDecomposition.hpp
@@ -0,0 +1,202 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+/**
+ * Options controlling which eigenvalues, and optionally eigenvectors, heevr computes.
+ */
+struct EigenDecompositionOptions
+{
+  /**
+   * Whether to compute eigenvalues only or eigenvalues and eigenvectors.
+   */
+  enum Type
+  {
+    EIGENVALUES,
+    EIGENVALUES_AND_VECTORS,
+  };
+
+  /**
+   * Which part of the spectrum to compute.
+   */
+  enum Range
+  {
+    ALL,
+    IN_INTERVAL,
+    BY_INDEX,
+  };
+
+  /**
+   * Compute all the eigenvalues, and the eigenvectors if requested.
+   */
+  EigenDecompositionOptions( Type const typeP, double const abstolP=0 ):
+    type{ typeP },
+    abstol{ abstolP }
+  {}
+
+  /**
+   * Compute only the eigenvalues in the half-open interval (rangeMin, rangeMax].
+   */
+  EigenDecompositionOptions(
+    Type const typeP,
+    double const rangeMinP,
+    double const rangeMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ Range::IN_INTERVAL },
+    rangeMin{ rangeMinP },
+    rangeMax{ rangeMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF_GE( rangeMin, rangeMax );
+  }
+
+  /**
+   * Compute only the eigenvalues with indices indexMin through indexMax.
+   * TODO: Not sure how I feel about the one based indexing for eigenvalues by index.
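+   * LAPACK's convention is one based and inclusive, so, for example (an illustrative
+   * call, not a tested one), EigenDecompositionOptions( Type::EIGENVALUES, 1, 3, 0 )
+   * would request the three smallest eigenvalues.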
+   */
+  EigenDecompositionOptions(
+    Type const typeP,
+    DenseInt const indexMinP,
+    DenseInt const indexMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ Range::BY_INDEX },
+    indexMin{ indexMinP },
+    indexMax{ indexMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF_LT( indexMin, 1 );
+    LVARRAY_ERROR_IF_GT( indexMin, indexMax );
+  }
+
+  /**
+   * @return The LAPACK JOBZ argument: "N" for eigenvalues only, "V" for eigenvalues and eigenvectors.
+   */
+  char const * typeArg() const
+  {
+    static constexpr char const * const eigenvalueString = "N";
+    static constexpr char const * const eigenvectorString = "V";
+
+    return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString;
+  }
+
+  /**
+   * @return The LAPACK RANGE argument: "A" for all, "V" for an interval, "I" for an index range.
+   */
+  char const * rangeArg() const
+  {
+    static constexpr char const * const allString = "A";
+    static constexpr char const * const intervalString = "V";
+    static constexpr char const * const indexString = "I";
+
+    if( range == Range::ALL )
+    { return allString; }
+
+    return range == Range::IN_INTERVAL ? intervalString : indexString;
+  }
+
+  /// The type of computation to perform.
+  Type const type;
+
+  /// The part of the spectrum to compute.
+  Range const range = Range::ALL;
+
+  /// The lower bound of the search interval when range == IN_INTERVAL.
+  double const rangeMin = std::numeric_limits< double >::max();
+
+  /// The upper bound of the search interval when range == IN_INTERVAL.
+  double const rangeMax = std::numeric_limits< double >::lowest();
+
+  /// The one based index of the smallest eigenvalue to find when range == BY_INDEX.
+  DenseInt const indexMin = std::numeric_limits< DenseInt >::max();
+
+  /// The one based index of the largest eigenvalue to find when range == BY_INDEX.
+  DenseInt const indexMax = std::numeric_limits< DenseInt >::lowest();
+
+  /// The absolute error tolerance for the eigenvalues; 0 selects the routine's default tolerance.
+  double const abstol = 0;
+};
+
+
+/**
+ * Compute the eigenvalues, and optionally the eigenvectors, of the Hermitian matrix @p A.
+ * @return The number of eigenvalues found.
+ */
+template< typename T >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+/**
+ * Overload accepting ArraySlices, which are wrapped in Matrix and Vector adaptors.
+ */
+template< typename T, int USD, typename INDEX_TYPE >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A,
+  ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues,
+  ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & eigenVectors,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  Matrix< std::complex< T > > AMatrix( A );
+  Vector< T > eigenValuesVector( eigenValues );
+  Matrix< std::complex< T > > eigenVectorsMatrix( eigenVectors );
+  Vector< DenseInt > supportVector( support );
+
+  return heevr(
+    space,
+    decompositionOptions,
+    AMatrix,
+    eigenValuesVector,
+    eigenVectorsMatrix,
+    supportVector,
+    workspace,
+    storageType );
+}
+
+/**
+ * Overload accepting ArrayViews, which are moved to the given space before forwarding.
+ */
+template< typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+DenseInt heevr(
+  MemorySpace const space,
+  EigenDecompositionOptions const decompositionOptions,
+  ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues,
+  ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  // Unclear whether the touch on A is needed here, since half of A is destroyed; maybe it's not necessary.
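+  // Working assumption: move( space, true ) both migrates each buffer and "touches" it,
+  // marking the copy in the target space as modified so stale copies elsewhere are
+  // invalidated. heevr overwrites A and fills eigenValues, eigenVectors and support, so
+  // the outputs at least must be touched; the touch on A is the part that may be avoidable.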
+ A.move( space, true ); + eigenValues.move( space, true ); + eigenVectors.move( space, true ); + support.move( space, true ); + + return heevr( + space, + decompositionOptions, + A.toSlice(), + eigenValues.toSlice(), + eigenVectors.toSlice(), + support.toSlice(), + workspace, + storageType ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigendecomposition.cpp b/src/dense/eigendecomposition.cpp deleted file mode 100644 index 5bf2e0c4..00000000 --- a/src/dense/eigendecomposition.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "eigendecomposition.hpp" - -/// This macro provide a flexible interface for Fortran naming convention for compiled objects -// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE -#define FORTRAN_MANGLE( name ) name -// #else -// #define FORTRAN_MANGLE( name ) name ## _ -// #endif - -extern "C" -{ - -#define LVARRAY_CHEEV FORTRAN_MANGLE( cheev ) -void LVARRAY_CHEEV( - char const * JOBZ, - char const * UPLO, - int const * N, - std::complex< float > * A, - int const * LDA, - float * W, - std::complex< float > * WORK, - int const * LWORK, - float const * RWORK, - int * INFO -); - -#define LVARRAY_ZHEEV FORTRAN_MANGLE( zheev ) -void LVARRAY_ZHEEV( - char const * JOBZ, - char const * UPLO, - int const * N, - std::complex< double > * A, - int const * LDA, - double * W, - std::complex< double > * WORK, - int const * LWORK, - double const * RWORK, - int * INFO ); - -#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) -void LVARRAY_ZHEEVR( - char const * JOBZ, - char const * RANGE, - char const * UPLO, - int const * N, - std::complex< double > * A, - int const * LDA, - double const * VL, - double const * VU, - int const * IL, - int const * IU, - double const * ABSTOL, - int * M, - double * W, - std::complex< double > * Z, - int const * LDZ, - int * ISUPPZ, - std::complex< double > * WORK, - int const * LWORK, - double * RWORK, - int const * LRWORK, - int * IWORK, - int const * LIWORK, - int * INFO ); - - -} // extern "C" - -namespace LvArray -{ -namespace dense -{ -namespace internal -{ - -template< typename T > -int heevr( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< T > > & workspace, - SymmetricMatrixStorageType const storageType, - bool const compute ) -{ - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); - - LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
); - - char const * const JOBZ = decompositionOptions.typeArg(); - char const * const RANGE = decompositionOptions.rangeArg(); - char const * const UPLO = getOption( storageType ); - int const N = integerConversion< int >( A.nCols ); - int const LDA = A.stride; - - T const VL = decompositionOptions.rangeMin; - T const VU = decompositionOptions.rangeMax; - - if( decompositionOptions.range == EigenDecompositionOptions::Range::IN_INTERVAL ) - { - LVARRAY_ERROR_IF_GE( VL, VU ); - } - - int maxEigenvaluesToFind = N; - int const IL = decompositionOptions.indexMin; - int const IU = decompositionOptions.indexMax; - if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) - { - LVARRAY_ERROR_IF_LT( IL, 1 ); - LVARRAY_ERROR_IF_GT( IU, N ); - LVARRAY_ERROR_IF_GT( IL, IU ); - - maxEigenvaluesToFind = IU - IL + 1; - } - - LVARRAY_ERROR_IF_LT( eigenValues.n, maxEigenvaluesToFind ); - - int const ABSTOL = decompositionOptions.abstol; - int M = 0; - - if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) - { - LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); - } - - int const LDZ = eigenVectors.stride; - - // TODO: check ISUPPZ - - int const LWORK = compute ? integerConversion< int >( workspace.work.size() ) : -1; - int const LRWORK = integerConversion< int >( workspace.rwork.size() ); - int const LIWORK = integerConversion< int >( workspace.iwork.size() ); - - int INFO = 0; - - // With C++ 17 we can remove the reinterpret_cast with constexpr if. - if( std::is_same< T, float >::value ) - { - } - else - { - LVARRAY_ZHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< double > * >( A.data ), - &LDA, - reinterpret_cast< double const * >( &VL ), - reinterpret_cast< double const * >( &VU ), - &IL, - &IU, - reinterpret_cast< double const * >( &ABSTOL ), - &M, - reinterpret_cast< double * >( eigenValues.data ), - reinterpret_cast< std::complex< double > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< double > * >( workspace.work.data() ), - &LWORK, - reinterpret_cast< double * >( workspace.rwork.data() ), - &LRWORK, - workspace.iwork.data(), - &LIWORK, - &INFO ); - } - - LVARRAY_ERROR_IF_NE( INFO, 0 ); - - return M; -} - -} // namespace internal - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template< typename T > -int heevr( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< T > > & workspace, - SymmetricMatrixStorageType const storageType ) -{ - bool const reallocateWork = workspace.work.size() < 2 * A.nRows; - bool const reallocateRWork = workspace.rwork.size() < 24 * A.nRows; - bool const reallocateIWork = workspace.iwork.size() < 10 * A.nRows; - - if( reallocateWork || reallocateRWork || reallocateIWork ) - { - Workspace< std::complex< T > > optimalSizes( 1 ); - internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); - - if( reallocateWork ) - { - workspace.work.resizeWithoutInitializationOrDestruction( space, static_cast< std::ptrdiff_t >( optimalSizes.work[ 0 ].real() ) ); - } - - if( reallocateRWork ) - { - workspace.rwork.resizeWithoutInitializationOrDestruction( space, 
static_cast< std::ptrdiff_t >( optimalSizes.rwork[ 0 ] ) ); - } - - if( reallocateIWork ) - { - workspace.rwork.resizeWithoutInitializationOrDestruction( space, optimalSizes.iwork[ 0 ] ); - } - } - - return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); -} - -// explicit instantiations. -template int heevr< float >( - MemorySpace const space, - EigenDecompositionOptions const decompositionOptions, - Matrix< std::complex< float > > const & A, - Vector< float > const & eigenValues, - Matrix< std::complex< float > > const & eigenVectors, - Vector< int > const & support, - Workspace< std::complex< float > > & workspace, - SymmetricMatrixStorageType const storageType ); - -// template int heevr< double >( -// MemorySpace const space, -// EigenDecompositionOptions const decompositionOptions, -// Matrix< std::complex< double > > const & A, -// Vector< double > const & eigenValues, -// Matrix< std::complex< double > > const & eigenVectors, -// Vector< int > const & support, -// Workspace< std::complex< double > > & workspace, -// SymmetricMatrixStorageType const storageType ); - -} // namespace dense -} // namespace LvArray diff --git a/src/dense/eigendecomposition.hpp b/src/dense/eigendecomposition.hpp deleted file mode 100644 index 75ae830b..00000000 --- a/src/dense/eigendecomposition.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include "common.hpp" - -namespace LvArray -{ -namespace dense -{ - -/** - * - */ -struct EigenDecompositionOptions -{ - enum Type - { - EIGENVALUES, - EIGENVALUES_AND_VECTORS, - }; - - enum Range - { - ALL, - IN_INTERVAL, - BY_INDEX, - }; - - char const * typeArg() const - { - static constexpr char const * const eigenvalueString = "N"; - static constexpr char const * const eigenvectorString = "V"; - - return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString; - } - - char const * rangeArg() const - { - static constexpr char const * const allString = "A"; - static constexpr char const * const intervalString = "V"; - static constexpr char const * const indexString = "I"; - - if( range == Range::ALL ) - { return allString; } - - return range == Range::IN_INTERVAL ? intervalString : indexString; - } - - Type const type; - Range const range; - double const rangeMin; - double const rangeMax; - int const indexMin; - int const indexMax; - double const abstol; -}; - -} // namespace dense -} // namespace LvArray \ No newline at end of file diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 4d91681e..3ac33255 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -149,3 +149,8 @@ install(TARGETS testTensorOps if( ENABLE_PYLVARRAY ) add_subdirectory( python ) endif() + +if( ENABLE_LAPACK ) + add_subdirectory( dense ) +endif() + diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt new file mode 100644 index 00000000..f324797e --- /dev/null +++ b/unitTests/dense/CMakeLists.txt @@ -0,0 +1,34 @@ +################################################################################################### +# Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. +# All rights reserved. +# See the LICENSE file for details. 
+# SPDX-License-Identifier: (BSD-3-Clause) +################################################################################################### + +# +# Specify list of tests +# +set( testSources + testEigenDecomposition.cpp + ) + +# +# Add gtest C++ based tests +# +foreach(test ${testSources}) + get_filename_component( test_name ${test} NAME_WE ) + blt_add_executable( NAME ${test_name} + SOURCES ${test} + OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY} + DEPENDS_ON gtest lvarray lvarraydense ${lvarray_dependencies} ) + + target_include_directories( ${test_name} PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../../src ) + + blt_add_test( NAME ${test_name} + COMMAND ${test_name} ) + + install(TARGETS ${test_name} + DESTINATION bin) +endforeach() + + diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp new file mode 100644 index 00000000..8f1c1a2b --- /dev/null +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +// Source includes +#include "dense/eigenDecomposition.hpp" + +#include "../testUtils.hpp" + +namespace LvArray +{ +namespace testing +{ + +template< typename T > +using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; + +template< typename T, typename PERM > +using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; + + +TEST( heevr, allEigenvalues ) +{ + Array2d< std::complex< double >, RAJA::PERM_JI > matrix( 3, 3 ); + matrix( 1, 1 ) = 2; + matrix( 0, 0 ) = 3; + matrix( 2, 2 ) = -4; + + Array1d< double > eigenvalues( 3 ); + Array2d< std::complex< double >, RAJA::PERM_JI > eigenvectors; + Array1d< int > support( 6 ); + dense::ArrayWorkspace< std::complex< double >, ChaiBuffer > workspace; + dense::SymmetricMatrixStorageType storageType = dense::SymmetricMatrixStorageType::UPPER_TRIANGULAR; + + dense::heevr< double >( + MemorySpace::host, + dense::EigenDecompositionOptions( dense::EigenDecompositionOptions::Type::EIGENVALUES ), + matrix.toView(), + eigenvalues.toView(), + eigenvectors.toView(), + support, + workspace, + storageType ); + + EXPECT_DOUBLE_EQ( eigenvalues[ 0 ], -4 ); + EXPECT_DOUBLE_EQ( eigenvalues[ 1 ], 2 ); + EXPECT_DOUBLE_EQ( eigenvalues[ 2 ], 3 ); +} + +} // namespace testing +} // namespace LvArray + +// This is the default gtest main method. It is included for ease of debugging. +int main( int argc, char * * argv ) +{ + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + return result; +} From 3ac40d6f453fa8943a908f255a0a1f3cd113a959 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 1 Sep 2022 22:01:51 -0700 Subject: [PATCH 31/34] Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces. 
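
MAGMA is located with the new find_and_register helper in SetupTPL.cmake
instead of find_package, so only magma.h and the magma library under
MAGMA_DIR are required. The registration amounts to the call added below
(reproduced here for context):

    find_and_register(NAME magma
                      INCLUDE_DIRECTORIES ${MAGMA_DIR}/include
                      LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib
                      HEADER magma.h
                      LIBRARIES magma)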
--- cmake/Config.cmake | 5 +- cmake/SetupTPL.cmake | 64 ++- scripts/uberenv/packages/lvarray/package.py | 72 +-- scripts/uberenv/packages/magma/cmake-W.patch | 12 + scripts/uberenv/packages/magma/ibm-xl.patch | 248 ++++++++++ .../packages/magma/magma-2.3.0-gcc-4.8.patch | 24 + .../packages/magma/magma-2.5.0-cmake.patch | 77 ++++ .../uberenv/packages/magma/magma-2.5.0.patch | 428 ++++++++++++++++++ scripts/uberenv/packages/magma/package.py | 125 +++++ .../blueos_3_ppc64le_ib_p9/packages.yaml | 9 + src/LvArrayConfig.hpp.in | 2 + src/dense/CMakeLists.txt | 9 +- src/dense/common.cpp | 13 + src/dense/common.hpp | 74 ++- src/dense/eigenDecomposition.cpp | 277 +++++++++--- src/dense/eigenDecomposition.hpp | 33 +- unitTests/dense/testEigenDecomposition.cpp | 116 +++-- 17 files changed, 1405 insertions(+), 183 deletions(-) create mode 100644 scripts/uberenv/packages/magma/cmake-W.patch create mode 100644 scripts/uberenv/packages/magma/ibm-xl.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch create mode 100644 scripts/uberenv/packages/magma/magma-2.5.0.patch create mode 100644 scripts/uberenv/packages/magma/package.py diff --git a/cmake/Config.cmake b/cmake/Config.cmake index cf8ff35b..c513fbab 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,9 +2,10 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA - HIP + HIP TOTALVIEW_OUTPUT - CALIPER ) + CALIPER + MAGMA ) set( USE_CONFIGFILE ON CACHE BOOL "" ) foreach( DEP in ${PREPROCESSOR_DEFINES}) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index c312306b..c40d0582 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,3 +1,60 @@ +macro(find_and_register) + set(singleValueArgs NAME HEADER) + set(multiValueArgs INCLUDE_DIRECTORIES + LIBRARY_DIRECTORIES + LIBRARIES + EXTRA_LIBRARIES + DEPENDS ) + + ## parse the arguments + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT DEFINED arg_NAME) + message(FATAL_ERROR "The find_and_register required parameter NAME specifies the name of the library to register.") + endif() + + if(NOT DEFINED arg_INCLUDE_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter INCLUDE_DIRECTORIES specifies the directories to search for the given header.") + endif() + + if(NOT DEFINED arg_LIBRARY_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARY_DIRECTORIES specifies the directories to search for the given libraries.") + endif() + + if(NOT DEFINED arg_HEADER) + message(FATAL_ERROR "The find_and_register required parameter HEADER specifies the header to search for.") + endif() + + if(NOT DEFINED arg_LIBRARIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARIES specifies the libraries to search for.") + endif() + + find_path(${arg_NAME}_INCLUDE_DIR ${arg_HEADER} + PATHS ${arg_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH + NO_CMAKE_ENVIRONMENT_PATH + NO_CMAKE_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) + + if(${arg_NAME}_INCLUDE_DIR STREQUAL ${arg_NAME}_INCLUDE_DIR-NOTFOUND) + message(FATAL_ERROR "Could not find '${arg_HEADER}' in '${arg_INCLUDE_DIRECTORIES}'") + endif() + + blt_find_libraries(FOUND_LIBS ${arg_NAME}_LIBRARIES + NAMES ${arg_LIBRARIES} + PATHS ${arg_LIBRARY_DIRECTORIES} + REQUIRED ON) + + blt_import_library(NAME ${arg_NAME} + INCLUDES ${${arg_NAME}_INCLUDE_DIR} + LIBRARIES ${${arg_NAME}_LIBRARIES} ${arg_EXTRA_LIBRARIES} + TREAT_INCLUDES_AS_SYSTEM ON + 
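+                     # TREAT_INCLUDES_AS_SYSTEM asks the compiler to treat the imported
+                     # headers as system headers, so they don't trip this project's warning flags.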
DEPENDS_ON ${arg_DEPENDS}) + +endmacro(find_and_register) + set(thirdPartyLibs "") ############################### @@ -162,8 +219,11 @@ if(ENABLE_MAGMA) message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") endif() - find_package(magma REQUIRED - PATHS ${MAGMA_DIR}) + find_and_register(NAME magma + INCLUDE_DIRECTORIES ${MAGMA_DIR}/include + LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib + HEADER magma.h + LIBRARIES magma) set(thirdPartyLibs ${thirdPartyLibs} magma) else() diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 347ca123..7fc306fd 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -57,7 +57,7 @@ class Lvarray(CMakePackage, CudaPackage): variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') variant('lapack', default=False, description='Build LAPACK and BLAS support') - # variant('magma', default=False, description='Build MAGMA support') + variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') @@ -65,11 +65,15 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') +<<<<<<< HEAD variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) # conflicts('~lapack', when='+magma') +======= + conflicts('~lapack', when='+magma') +>>>>>>> cde43f2 (Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces.) 
depends_on('blt@0.4.1:', when='@0.2.0:', type='build') @@ -90,7 +94,7 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('blas', when='+lapack') depends_on('lapack', when='+lapack') - # depends_on('magma', when='+magma') + depends_on('magma', when='+magma') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') @@ -310,59 +314,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option("ENABLE_CHAI", False)) -<<<<<<< HEAD - if "+caliper" in spec: - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) - cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) - - cfg.write('#{0}\n'.format('-' * 80)) - cfg.write('# Python\n') - cfg.write('#{0}\n\n'.format('-' * 80)) - - if '+pylvarray' in spec: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) - cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) - else: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) - - # cfg.write('#{0}\n'.format('-' * 80)) - # cfg.write('# Math libraries\n') - # cfg.write('#{0}\n\n'.format('-' * 80)) - # if '+lapack' in spec: - # cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) - # cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) - # cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) - # else: - # cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) - - # if '+magma' in spec: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) - # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) - # else: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Documentation\n") - cfg.write("#{0}\n\n".format("-" * 80)) - - if "+docs" in spec: - cfg.write(cmake_cache_option("ENABLE_DOCS", True)) - sphinx_dir = spec['py-sphinx'].prefix - cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', - os.path.join(sphinx_dir, - 'bin', - 'sphinx-build'))) - - doxygen_dir = spec['doxygen'].prefix - cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', - os.path.join(doxygen_dir, - 'bin', - 'doxygen'))) - else: - cfg.write(cmake_cache_option("ENABLE_DOCS", False)) -======= cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Caliper\n") cfg.write("#{0}\n\n".format("-" * 80)) @@ -396,11 +347,11 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) - # if '+magma' in spec: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) - # cfg.write(cmake_cache_list('MAGMA_DIR', spec['magma'].prefix)) - # else: - # cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) + if '+magma' in spec: + cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + cfg.write(cmake_cache_entry('MAGMA_DIR', spec['magma'].prefix)) + else: + cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") @@ -420,7 +371,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): 'doxygen'))) else: cfg.write(cmake_cache_option("ENABLE_DOCS", False)) ->>>>>>> f6cec78 (Eigen stuff seems to be at least partialy working.) 
cfg.write("#{0}\n".format("-" * 80)) cfg.write("# addr2line\n") diff --git a/scripts/uberenv/packages/magma/cmake-W.patch b/scripts/uberenv/packages/magma/cmake-W.patch new file mode 100644 index 00000000..59179676 --- /dev/null +++ b/scripts/uberenv/packages/magma/cmake-W.patch @@ -0,0 +1,12 @@ +diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -363,8 +363,6 @@ + else() + # Primarily for gcc / nvcc: + # Ignore unused static functions in headers. +- set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-function" ) +- set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unused-function" ) + endif() + + if (CMAKE_HOST_APPLE) diff --git a/scripts/uberenv/packages/magma/ibm-xl.patch b/scripts/uberenv/packages/magma/ibm-xl.patch new file mode 100644 index 00000000..0deab656 --- /dev/null +++ b/scripts/uberenv/packages/magma/ibm-xl.patch @@ -0,0 +1,248 @@ +diff -Naur magma-2.2.0/src/dlaex3_m.cpp magma-2.2.0-patched/src/dlaex3_m.cpp +--- magma-2.2.0/src/dlaex3_m.cpp 2016-11-20 20:20:06.000000000 -0500 ++++ magma-2.2.0/src/dlaex3_m.cpp 2017-01-06 15:54:29.423668874 -0500 +@@ -197,7 +197,7 @@ + magmaDouble_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_dlaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + double d_one = 1.; + double d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. 
+ if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. + magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_dlaed3_m */ +diff -Naur magma-2.2.0/src/slaex3_m.cpp magma-2.2.0-patched/src/slaex3_m.cpp +--- magma-2.2.0/src/slaex3_m.cpp 2016-11-20 20:20:24.000000000 -0500 ++++ magma-2.2.0/src/slaex3_m.cpp 2017-01-06 10:20:13.200783151 -0500 +@@ -197,7 +197,7 @@ + magmaFloat_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_slaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + float d_one = 1.; + float d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+ magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_slaed3_m */ diff --git a/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch new file mode 100644 index 00000000..f734a5f1 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch @@ -0,0 +1,24 @@ +diff -ru magma-2.3.0/testing/testings.h magma-2.3.0-patched/testing/testings.h +--- magma-2.3.0/testing/testings.h 2017-11-14 21:34:00.000000000 -0800 ++++ magma-2.3.0-patched/testing/testings.h 2018-03-23 20:41:16.459934643 -0700 +@@ -269,4 +269,20 @@ + typename blas::traits::real_t* sigma, + FloatT* A, magma_int_t lda ); + ++// This overload for the case sigma = nullptr is a workaround for an issue ++// when building with gcc 4.8.5. This is not an issue with gcc 4.9.2. ++template< typename FloatT > ++void magma_generate_matrix( ++ magma_opts& opts, ++ magma_int_t m, magma_int_t n, ++ std::nullptr_t sigma, ++ FloatT* A, magma_int_t lda ) ++{ ++ magma_generate_matrix( ++ opts, ++ m, n, ++ (typename blas::traits::real_t*) sigma, ++ A, lda ); ++} ++ + #endif /* TESTINGS_H */ diff --git a/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch new file mode 100644 index 00000000..56b58d85 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch @@ -0,0 +1,77 @@ +diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -440,18 +440,20 @@ + # compile MAGMA sparse library + + # sparse doesn't have Fortran at the moment, so no need for above shenanigans +-include_directories( sparse/include ) +-include_directories( sparse/control ) +-include_directories( testing ) +-cuda_add_library( magma_sparse ${libsparse_all} ) +-target_link_libraries( magma_sparse +- magma ++if (MAGMA_SPARSE) ++ include_directories( sparse/include ) ++ include_directories( sparse/control ) ++ include_directories( testing ) ++ cuda_add_library( magma_sparse ${libsparse_all} ) ++ target_link_libraries( magma_sparse ++ magma + ${LAPACK_LIBRARIES} + ${CUDA_CUDART_LIBRARY} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_cusparse_LIBRARY} +-) +-set( LIBS_SPARSE ${LIBS} magma_sparse ) ++ ) ++ set( LIBS_SPARSE ${LIBS} magma_sparse ) ++endif() + + + # ---------------------------------------- +@@ -480,23 +482,31 @@ + + # ---------------------------------------- + # compile each sparse tester +-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing ) +-foreach( TEST ${sparse_testing_all} ) ++if (MAGMA_SPARSE) ++ set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing ) ++ foreach( TEST ${sparse_testing_all} ) + string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} ) + string( REGEX REPLACE "sparse/testing/" "" EXE ${EXE} ) + #message( "${TEST} --> ${EXE}" ) + add_executable( ${EXE} ${TEST} ) + target_link_libraries( ${EXE} ${LIBS_SPARSE} ) +-endforeach() ++ endforeach() ++endif() + + + # ---------------------------------------- + # what to install +-install( TARGETS magma magma_sparse ${blas_fix} ++set(MAGMA_TARGETS magma) ++set(MAGMA_HEADERS_PATTERNS include/*.h) ++if (MAGMA_SPARSE) ++ set(MAGMA_TARGETS ${MAGMA_TARGETS} magma_sparse) ++ set(MAGMA_HEADERS_PATTERNS ${MAGMA_HEADERS_PATTERNS} sparse/include/*.h) ++endif() ++install( TARGETS ${MAGMA_TARGETS} ${blas_fix} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + 
ARCHIVE DESTINATION lib ) +-file( GLOB headers include/*.h sparse/include/*.h ) ++file( GLOB headers ${MAGMA_HEADERS_PATTERNS} ) + install( FILES ${headers} + DESTINATION include ) + +@@ -509,4 +519,6 @@ + message( STATUS " NFLAGS ${CUDA_NVCC_FLAGS}" ) + message( STATUS " FFLAGS ${CMAKE_Fortran_FLAGS}" ) + message( STATUS " LIBS ${LIBS}" ) +-message( STATUS " LIBS_SPARSE ${LIBS_SPARSE}" ) ++if (MAGMA_SPARSE) ++ message( STATUS " LIBS_SPARSE ${LIBS_SPARSE}" ) ++endif() diff --git a/scripts/uberenv/packages/magma/magma-2.5.0.patch b/scripts/uberenv/packages/magma/magma-2.5.0.patch new file mode 100644 index 00000000..1ac800c5 --- /dev/null +++ b/scripts/uberenv/packages/magma/magma-2.5.0.patch @@ -0,0 +1,428 @@ +diff -r 89706c0efbdb .hgtags +--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500 ++++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700 +@@ -1,3 +1,4 @@ + 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0 + b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0 + 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0 ++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0 +diff -r 89706c0efbdb include/magma_types.h +--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500 ++++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700 +@@ -77,7 +77,7 @@ + typedef magma_int_t magma_device_t; + + // Half precision in CUDA +- #if defined(__cplusplus) && CUDA_VERSION > 7500 ++ #if defined(__cplusplus) && CUDA_VERSION >= 7500 + #include + typedef __half magmaHalf; + #else +diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu +--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500 ++++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700 +@@ -15,9 +15,12 @@ + + #define PRECISION_z + ++ + namespace magma_sampleselect { + +-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) { ++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) ++{ ++#if (__CUDA_ARCH__ >= 350) + auto idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= size) { + return; +@@ -25,6 +28,7 @@ + + auto v = in[idx]; + out[idx] = real(v) * real(v) + imag(v) * imag(v); ++#endif + } + + } // namespace magma_sampleselect +@@ -164,36 +168,43 @@ + magma_queue_t queue ) + { + magma_int_t info = 0; ++ magma_int_t arch = magma_getdevice_arch(); + +- auto num_blocks = magma_ceildiv(total_size, block_size); +- auto local_work = (total_size + num_threads - 1) / num_threads; +- auto required_size = sizeof(double) * (total_size + searchtree_size) ++ if( arch >= 350 ) { ++ auto num_blocks = magma_ceildiv(total_size, block_size); ++ auto local_work = (total_size + num_threads - 1) / num_threads; ++ auto required_size = sizeof(double) * (total_size + searchtree_size) + + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1); +- auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size); ++ auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size); + +- double* gputmp = (double*)*tmp_ptr; +- double* gputree = gputmp + total_size; +- uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size); +- int32_t* gpurankout = (int32_t*)(gpubucketidx + 1); +- int32_t* gpucounts = gpurankout + 1; +- int32_t* gpulocalcounts = gpucounts + searchtree_width; +- uint32_t bucketidx{}; ++ double* gputmp = (double*)*tmp_ptr; ++ double* gputree = gputmp + total_size; ++ uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size); ++ int32_t* gpurankout = (int32_t*)(gpubucketidx + 1); ++ 
int32_t* gpucounts = gpurankout + 1; ++ int32_t* gpulocalcounts = gpucounts + searchtree_width; ++ uint32_t bucketidx{}; + +- CHECK(realloc_result); ++ CHECK(realloc_result); + +- compute_abs<<cuda_stream()>>> +- (val, gputmp, total_size); +- build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>> +- (gputmp, gputree, total_size); +- count_buckets<<cuda_stream()>>> +- (gputmp, gputree, gpulocalcounts, total_size, local_work); +- reduce_counts<<cuda_stream()>>> +- (gpulocalcounts, gpucounts, num_grouped_blocks); +- sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>> +- (gpucounts, subset_size, gpubucketidx, gpurankout); +- magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue); +- magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue); +- *thrs = std::sqrt(*thrs); ++ compute_abs<<cuda_stream()>>> ++ (val, gputmp, total_size); ++ build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>> ++ (gputmp, gputree, total_size); ++ count_buckets<<cuda_stream()>>> ++ (gputmp, gputree, gpulocalcounts, total_size, local_work); ++ reduce_counts<<cuda_stream()>>> ++ (gpulocalcounts, gpucounts, num_grouped_blocks); ++ sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>> ++ (gpucounts, subset_size, gpubucketidx, gpurankout); ++ magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue); ++ magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue); ++ *thrs = std::sqrt(*thrs); ++ } ++ else { ++ printf("error: this functionality needs CUDA architecture >= 3.5\n"); ++ info = MAGMA_ERR_NOT_SUPPORTED; ++ } + + cleanup: + return info; +diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp +--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -16,6 +16,131 @@ + #include + #endif + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. ++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? 
++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + #include "magma_internal.h" + //#include "nvToolsExt.h" + +@@ -106,10 +231,13 @@ + float c_one = MAGMA_S_ONE; + float c_neg_one = MAGMA_S_NEG_ONE; + #if 1 ++ #if CUDA_VERSION >= 9020 + const magmaHalf h_one = (magmaHalf) 1.0; + const magmaHalf h_neg_one = (magmaHalf)-1.0; +- //const magmaHalf h_one = approx_float_to_half(1.0); +- //const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #else ++ const magmaHalf h_one = approx_float_to_half(1.0); ++ const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #endif + #else + FP32 float_one = *((FP32*)&c_one); + FP16 half_one = float_to_half_full(float_one); +diff -r 89706c0efbdb src/xshgetrf_gpu.cpp +--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -92,7 +92,7 @@ + magma_mp_type_t enable_tc, + magma_mp_type_t mp_algo_type ) + { +-#if CUDA_VERSION >= 7500 ++#if CUDA_VERSION >= 9000 + #ifdef HAVE_clBLAS + #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) + #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) +diff -r 89706c0efbdb testing/testing_hgemm.cpp +--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -22,6 +22,131 @@ + #include "magma_operators.h" + #include "testings.h" + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. 
++
++typedef unsigned uint;
++
++union FP32
++{
++ uint u;
++ float f;
++ struct
++ {
++ uint Mantissa : 23;
++ uint Exponent : 8;
++ uint Sign : 1;
++ };
++};
++
++union FP16
++{
++ unsigned short u;
++ struct
++ {
++ uint Mantissa : 10;
++ uint Exponent : 5;
++ uint Sign : 1;
++ };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++ FP32 f32infty = { 255 << 23 };
++ FP32 f16max = { (127 + 16) << 23 };
++ FP32 magic = { 15 << 23 };
++ FP32 expinf = { (255 ^ 31) << 23 };
++ uint sign_mask = 0x80000000u;
++ FP16 o = { 0 };
++
++ FP32 f = *((FP32*)&fl);
++
++ uint sign = f.u & sign_mask;
++ f.u ^= sign;
++
++ if (!(f.f < f32infty.u)) // Inf or NaN
++ o.u = f.u ^ expinf.u;
++ else
++ {
++ if (f.f > f16max.f) f.f = f16max.f;
++ f.f *= magic.f;
++ }
++
++ o.u = f.u >> 13; // Take the mantissa bits
++ o.u |= sign >> 16;
++ half tmp;
++ memcpy(&tmp, &o, sizeof(half));
++ //return *((half*)&o);
++ return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++ FP16 h;
++ memcpy(&h, &hf, sizeof(half));
++
++ static const FP32 magic = { 113 << 23 };
++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++ FP32 o;
++
++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
++ uint exp = shifted_exp & o.u; // just the exponent
++ o.u += (127 - 15) << 23; // exponent adjust
++
++ // handle exponent special cases
++ if (exp == shifted_exp) // Inf/NaN?
++ o.u += (128 - 16) << 23; // extra exp adjust
++ else if (exp == 0) // Zero/Denormal?
++ {
++ o.u += 1 << 23; // extra exp adjust
++ o.f -= magic.f; // renormalize
++ }
++
++ o.u |= (h.u & 0x8000) << 16; // sign bit
++ return o.f;
++}
++#endif
++
+ /* ////////////////////////////////////////////////////////////////////////////
+ -- Testing sgemm
+ */
+@@ -47,8 +172,13 @@
+ float c_neg_one = MAGMA_S_NEG_ONE;
+ float alpha = MAGMA_S_MAKE( 0.29, -0.86 );
+ float beta = MAGMA_S_MAKE( -0.48, 0.38 );
+- magmaHalf h_alpha = (magmaHalf)alpha;
+- magmaHalf h_beta = (magmaHalf)beta;
++ #if CUDA_VERSION >= 9020
++ const magmaHalf h_alpha = (magmaHalf) alpha;
++ const magmaHalf h_beta = (magmaHalf) beta;
++ #else
++ const magmaHalf h_alpha = approx_float_to_half(alpha);
++ const magmaHalf h_beta = approx_float_to_half(beta);
++ #endif
+ magma_opts opts;
+ opts.parse_opts( argc, argv );
+
diff --git a/scripts/uberenv/packages/magma/package.py b/scripts/uberenv/packages/magma/package.py
new file mode 100644
index 00000000..8d37bec6
--- /dev/null
+++ b/scripts/uberenv/packages/magma/package.py
@@ -0,0 +1,125 @@
+# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+
+from spack import *
+
+
+class Magma(CMakePackage, CudaPackage):
+    """The MAGMA project aims to develop a dense linear algebra library similar
+    to LAPACK but for heterogeneous/hybrid architectures, starting with
+    current "Multicore+GPU" systems.
+ """ + + homepage = "http://icl.cs.utk.edu/magma/" + url = "http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz" + maintainers = ['stomov', 'luszczek'] + + version('2.5.4', sha256='7734fb417ae0c367b418dea15096aef2e278a423e527c615aab47f0683683b67') + version('2.5.3', sha256='c602d269a9f9a3df28f6a4f593be819abb12ed3fa413bba1ff8183de721c5ef6') + version('2.5.2', sha256='065feb85558f9dd6f4cc4db36ac633a3f787827fc832d0b578a049a43a195620') + version('2.5.1', sha256='ce32c199131515336b30c92a907effe0c441ebc5c5bdb255e4b06b2508de109f') + version('2.5.0', sha256='4fd45c7e46bd9d9124253e7838bbfb9e6003c64c2c67ffcff02e6c36d2bcfa33') + version('2.4.0', sha256='4eb839b1295405fd29c8a6f5b4ed578476010bf976af46573f80d1169f1f9a4f') + version('2.3.0', sha256='010a4a057d7aa1e57b9426bffc0958f3d06913c9151463737e289e67dd9ea608') + version('2.2.0', sha256='df5d4ace417e5bf52694eae0d91490c6bde4cde1b0da98e8d400c5c3a70d83a2') + + variant('fortran', default=True, + description='Enable Fortran bindings support') + variant('shared', default=True, + description='Enable shared library') + variant('cuda', default=True, description='Build with CUDA') + variant('cuda_arch', default='none', multi=True, + description='Specify CUDA architecture(s)') + + # corbett5 added this variant + variant('fortran_convention', default='default', description='LAPACK/BLAS mangling scheme', + values=('default', 'add_', 'nochange', 'upcase'), multi=False) + + depends_on('blas') + depends_on('lapack') + depends_on('cuda@8:', when='@2.5.1:') # See PR #14471 + + conflicts('~cuda', msg='Magma requires cuda') + conflicts('cuda_arch=none', + msg='Please indicate a CUDA arch value or values') + + # currently not compatible with CUDA-11 + # https://bitbucket.org/icl/magma/issues/22/cuda-11-changes-issue + # https://bitbucket.org/icl/magma/issues/25/error-cusparsesolveanalysisinfo_t-does-not + conflicts('^cuda@11:', when='@:2.5.3') + + patch('ibm-xl.patch', when='@2.2:2.5.0%xl') + patch('ibm-xl.patch', when='@2.2:2.5.0%xl_r') + patch('magma-2.3.0-gcc-4.8.patch', when='@2.3.0%gcc@:4.8') + patch('magma-2.5.0.patch', when='@2.5.0') + patch('magma-2.5.0-cmake.patch', when='@2.5.0') + patch('cmake-W.patch', when='@2.5.0:%nvhpc') + + def cmake_args(self): + spec = self.spec + options = [] + + options.extend([ + '-DCMAKE_INSTALL_PREFIX=%s' % self.prefix, + '-DCMAKE_INSTALL_NAME_DIR:PATH=%s/lib' % self.prefix, + '-DBLAS_LIBRARIES=%s' % spec['blas'].libs.joined(';'), + # As of MAGMA v2.3.0, CMakeLists.txt does not use the variable + # BLAS_LIBRARIES, but only LAPACK_LIBRARIES, so we need to + # explicitly add blas to LAPACK_LIBRARIES. 
+ '-DLAPACK_LIBRARIES=%s' % + (spec['lapack'].libs + spec['blas'].libs).joined(';') + ]) + + options += ['-DBUILD_SHARED_LIBS=%s' % + ('ON' if ('+shared' in spec) else 'OFF')] + + if '+fortran' in spec: + options.extend([ + '-DUSE_FORTRAN=yes' + ]) + if spec.satisfies('%xl') or spec.satisfies('%xl_r'): + options.extend([ + '-DCMAKE_Fortran_COMPILER=%s' % self.compiler.f77 + ]) + + # corbett5 added this else block + else: + options.extend([ + '-DUSE_FORTRAN=no' + ]) + + if spec.satisfies('^cuda'): + cuda_arch = self.spec.variants['cuda_arch'].value + if '@:2.2.0' in spec: + capabilities = ' '.join('sm{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + else: + capabilities = ' '.join('sm_{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + + if '@2.5.0' in spec: + options.extend(['-DMAGMA_SPARSE=OFF']) + if spec.compiler.name in ['xl', 'xl_r']: + options.extend(['-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=TRUE']) + + # corbett5 added these definitions + if spec.variants['fortran_convention'].value == 'add_': + options.extend(['-DFORTRAN_CONVENTION=-DADD_']) + + if spec.variants['fortran_convention'].value == 'nochange': + options.extend(['-DFORTRAN_CONVENTION=-DNOCHANGE']) + + if spec.variants['fortran_convention'].value == 'upcase': + options.extend(['-DFORTRAN_CONVENTION=-DUPCASE']) + + return options + + @run_after('install') + def post_install(self): + install('magmablas/atomics.cuh', self.prefix.include) + install('control/magma_threadsetting.h', self.prefix.include) + install('control/pthread_barrier.h', self.prefix.include) + install('control/magma_internal.h', self.prefix.include) diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index d054887c..265a6c5f 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -2,6 +2,15 @@ packages: all: target: [default] compiler: [gcc, clang, xl] + providers: + blas: [netlib-lapack] + lapack: [netlib-lapack] + + netlib-lapack: + buildable: False + externals: + - spec: netlib-lapack@3.10.0 ~external-blas + prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ cuda: buildable: False diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index dcbd30b3..bf48242a 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -32,4 +32,6 @@ #cmakedefine LVARRAY_USE_CALIPER +#cmakedefine LVARRAY_USE_MAGMA + #cmakedefine LVARRAY_ADDR2LINE_EXEC @LVARRAY_ADDR2LINE_EXEC@ diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 36778a28..3d7a1f34 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -8,10 +8,17 @@ set( lvarraydense_sources eigenDecomposition.cpp ) +set( dependencies lvarray ${lvarray_dependencies} blas lapack ) + +if( ENABLE_MAGMA ) + set( dependencies ${dependencies} magma ) +endif() + + blt_add_library( NAME lvarraydense SOURCES ${lvarraydense_sources} HEADERS ${lvarraydense_headers} - DEPENDS_ON lvarray ${lvarray_dependencies} blas lapack + DEPENDS_ON ${dependencies} SHARED TRUE CLEAR_PREFIX TRUE ) diff --git a/src/dense/common.cpp b/src/dense/common.cpp index 75c06070..8843ca82 100644 --- a/src/dense/common.cpp +++ b/src/dense/common.cpp @@ -14,5 +14,18 @@ char const * getOption( SymmetricMatrixStorageType const option ) return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ? 
upper : lower; } +//////////////////////////////////////////////////////////////////////////////////////////////////// +MemorySpace getSpaceForBackend( BuiltInBackends const backend ) +{ +#if defined( LVARRAY_USE_MAGMA ) + // TODO: This needs to be changed to MemorySpace::hip or whatever. + if( backend == BuiltInBackends::MAGMA_GPU ) return MemorySpace::cuda; +#else + LVARRAY_UNUSED_VARIABLE( backend ); +#endif + + return MemorySpace::host; +} + } // namespace dense } // namespace LvArray \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 146bb407..09ef3edd 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -53,6 +53,26 @@ template< typename T > using RealVersion = typename internal::RealVersion< T >::Type; +/** + * + */ +enum class BuiltInBackends +{ + LAPACK, +#if defined( LVARRAY_USE_MAGMA ) + MAGMA, + MAGMA_GPU, +#endif +}; + +/** + * + */ +MemorySpace getSpaceForBackend( BuiltInBackends const backend ); + +/** + * + */ using DenseInt = int; /** @@ -73,7 +93,9 @@ struct Matrix data{ slice.data() } {} - template< typename INDEX_TYPE > + /** + * + */ Matrix( T & value ): nRows{ 1 }, nCols{ 1 }, @@ -132,12 +154,20 @@ struct Workspace virtual Vector< T > work() = 0; + virtual Vector< T > work2() = 0; + + virtual Vector< T > work3() = 0; + virtual Vector< RealVersion< T > > rwork() = 0; virtual Vector< DenseInt > iwork() = 0; virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) = 0; + + virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; @@ -155,6 +185,12 @@ struct ArrayWorkspace : public Workspace< T > virtual Vector< T > work() override { return m_work.toSlice(); } + virtual Vector< T > work2() override + { return m_work2.toSlice(); } + + virtual Vector< T > work3() override + { return m_work3.toSlice(); } + virtual Vector< RealVersion< T > > rwork() override { return m_rwork.toSlice(); } @@ -163,16 +199,28 @@ struct ArrayWorkspace : public Workspace< T > virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override + { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } + + virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override + { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override - { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } private: Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work2; + + Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work3; + Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; @@ -190,6 +238,12 @@ struct OptimalSizeCalculation : public Workspace< T > virtual Vector< T > work() 
override { return m_work; } + virtual Vector< T > work2() override + { return m_work2; } + + virtual Vector< T > work3() override + { return m_work3; } + virtual Vector< RealVersion< T > > rwork() override { return m_rwork; } @@ -199,6 +253,12 @@ struct OptimalSizeCalculation : public Workspace< T > virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + + virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } @@ -215,11 +275,15 @@ struct OptimalSizeCalculation : public Workspace< T > { return m_iwork; } private: - T m_work; + T m_work { -1 }; + + T m_work2 { -1 }; + + T m_work3 { -1 }; - RealVersion< T > m_rwork; + RealVersion< T > m_rwork { -1 }; - DenseInt m_iwork; + DenseInt m_iwork { -1 }; }; } // namespace dense diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index 68a2256d..071d5996 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -1,5 +1,9 @@ #include "eigenDecomposition.hpp" +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + /// This macro provide a flexible interface for Fortran naming convention for compiled objects // #ifdef FORTRAN_MANGLE_NO_UNDERSCORE #define FORTRAN_MANGLE( name ) name @@ -79,17 +83,17 @@ namespace internal */ template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType, bool const compute ) { - LVARRAY_ERROR_IF_NE_MSG( space, MemorySpace::host, "Device not yet supported." ); + LVARRAY_UNUSED_VARIABLE( backend ); LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
); @@ -105,27 +109,27 @@ DenseInt heevr( DenseInt maxEigenvaluesToFind = N; DenseInt const IL = decompositionOptions.indexMin; DenseInt const IU = decompositionOptions.indexMax; - if( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX ) + if( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX ) { LVARRAY_ERROR_IF_GT( IU, N ); maxEigenvaluesToFind = IU - IL + 1; } - LVARRAY_ERROR_IF_LT( eigenValues.size, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_LT( eigenvalues.size, maxEigenvaluesToFind ); DenseInt const ABSTOL = decompositionOptions.abstol; DenseInt M = 0; - if( decompositionOptions.type == EigenDecompositionOptions::Type::EIGENVALUES_AND_VECTORS ) + if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) { - LVARRAY_ERROR_IF_NE( eigenVectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenVectors.nCols, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_NE( eigenvectors.nRows, N ); + LVARRAY_ERROR_IF_LT( eigenvectors.nCols, maxEigenvaluesToFind ); } - DenseInt const LDZ = std::max( 1, eigenVectors.stride ); + DenseInt const LDZ = std::max( 1, eigenvectors.stride ); - if( decompositionOptions.range == EigenDecompositionOptions::Range::ALL || - ( decompositionOptions.range == EigenDecompositionOptions::Range::BY_INDEX && + if( decompositionOptions.range == EigenDecompositionOptions::ALL || + ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX && maxEigenvaluesToFind == N ) ) { LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind ); @@ -138,59 +142,178 @@ DenseInt heevr( DenseInt INFO = 0; // With C++ 17 we can remove the reinterpret_cast with constexpr if. - if( std::is_same< T, float >::value ) + if( backend == BuiltInBackends::LAPACK ) { - LVARRAY_CHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< float > * >( A.data ), - &LDA, - reinterpret_cast< float const * >( &VL ), - reinterpret_cast< float const * >( &VU ), - &IL, - &IU, - reinterpret_cast< float const * >( &ABSTOL ), - &M, - reinterpret_cast< float * >( eigenValues.data ), - reinterpret_cast< std::complex< float > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< float > * >( workspace.work().data ), - &LWORK, - reinterpret_cast< float * >( workspace.rwork().data ), - &LRWORK, - workspace.iwork().data, - &LIWORK, - &INFO ); + if( std::is_same< T, float >::value ) + { + LVARRAY_CHEEVR( + JOBZ, + RANGE, + UPLO, + &N, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + reinterpret_cast< float const * >( &VL ), + reinterpret_cast< float const * >( &VU ), + &IL, + &IU, + reinterpret_cast< float const * >( &ABSTOL ), + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< std::complex< float > * >( eigenvectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< float > * >( workspace.work().data ), + &LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + &LRWORK, + workspace.iwork().data, + &LIWORK, + &INFO ); + } + else + { + LVARRAY_ZHEEVR( + JOBZ, + RANGE, + UPLO, + &N, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + reinterpret_cast< double const * >( &VL ), + reinterpret_cast< double const * >( &VU ), + &IL, + &IU, + reinterpret_cast< double const * >( &ABSTOL ), + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< std::complex< double > * >( eigenvectors.data ), + &LDZ, + support.data, + reinterpret_cast< std::complex< double > * >( workspace.work().data ), + &LWORK, + reinterpret_cast< double * 
>( workspace.rwork().data ), + &LRWORK, + workspace.iwork().data, + &LIWORK, + &INFO ); + } } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_cheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + int LDWA = N; + int LDWZ = 1; + + if( compute ) + { + workspace.resizeWork2( MemorySpace::cuda, LDWA * N ); + + if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) + { + LDWZ = N; + } + + workspace.resizeWork3( MemorySpace::cuda, LDWZ * maxEigenvaluesToFind ); + } + + if( std::is_same< T, float >::value ) + { + magma_cheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaFloatComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + LVARRAY_ERROR( "Not supported." 
); + } + } +#endif else { - LVARRAY_ZHEEVR( - JOBZ, - RANGE, - UPLO, - &N, - reinterpret_cast< std::complex< double > * >( A.data ), - &LDA, - reinterpret_cast< double const * >( &VL ), - reinterpret_cast< double const * >( &VU ), - &IL, - &IU, - reinterpret_cast< double const * >( &ABSTOL ), - &M, - reinterpret_cast< double * >( eigenValues.data ), - reinterpret_cast< std::complex< double > * >( eigenVectors.data ), - &LDZ, - support.data, - reinterpret_cast< std::complex< double > * >( workspace.work().data ), - &LWORK, - reinterpret_cast< double * >( workspace.rwork().data ), - &LRWORK, - workspace.iwork().data, - &LIWORK, - &INFO ); + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); } LVARRAY_ERROR_IF_NE( INFO, 0 ); @@ -203,15 +326,20 @@ DenseInt heevr( //////////////////////////////////////////////////////////////////////////////////////////////////// template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, - Vector< T > const & eigenValues, - Matrix< std::complex< T > > const & eigenVectors, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) { + // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. + // I'm not sure exactly how this would work for the eigenvectors though. + LVARRAY_ERROR_IF( !A.columnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.columnMajor, "Row major is not yet supported." ); + bool const reallocateWork = workspace.work().size < 2 * A.nRows; bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows; @@ -219,25 +347,30 @@ DenseInt heevr( if( reallocateWork || reallocateRWork || reallocateIWork ) { OptimalSizeCalculation< std::complex< T > > optimalSizes; - internal::heevr( MemorySpace::host, decompositionOptions, A, eigenValues, eigenVectors, support, optimalSizes, storageType, false ); + internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); + MemorySpace const space = getSpaceForBackend( backend ); + if( reallocateWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalWorkSize() ); workspace.resizeWork( space, optimalSizes.optimalWorkSize() ); } if( reallocateRWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalRWorkSize() ); workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() ); } if( reallocateIWork ) { + LVARRAY_LOG_VAR( optimalSizes.optimalIWorkSize() ); workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() ); } } - return internal::heevr( space, decompositionOptions, A, eigenValues, eigenVectors, support, workspace, storageType, true ); + return internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, workspace, storageType, true ); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -246,22 +379,22 @@ DenseInt heevr( //////////////////////////////////////////////////////////////////////////////////////////////////// template DenseInt heevr< float >( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< float > > const & A, - Vector< float > const & 
eigenValues, - Matrix< std::complex< float > > const & eigenVectors, + Vector< float > const & eigenvalues, + Matrix< std::complex< float > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< float > > & workspace, SymmetricMatrixStorageType const storageType ); //////////////////////////////////////////////////////////////////////////////////////////////////// template DenseInt heevr< double >( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< double > > const & A, - Vector< double > const & eigenValues, - Matrix< std::complex< double > > const & eigenVectors, + Vector< double > const & eigenvalues, + Matrix< std::complex< double > > const & eigenvectors, Vector< DenseInt > const & support, Workspace< std::complex< double > > & workspace, SymmetricMatrixStorageType const storageType ); diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp index 16ec001a..e83305fa 100644 --- a/src/dense/eigenDecomposition.hpp +++ b/src/dense/eigenDecomposition.hpp @@ -37,7 +37,9 @@ struct EigenDecompositionOptions EigenDecompositionOptions( Type const typeP, double const abstolP=0 ): type{ typeP }, abstol{ abstolP } - {} + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + } /** * @@ -48,11 +50,12 @@ struct EigenDecompositionOptions double const rangeMaxP, double const abstolP ): type{ typeP }, - range{ Range::IN_INTERVAL }, + range{ IN_INTERVAL }, rangeMin{ rangeMinP }, rangeMax{ rangeMaxP }, abstol{ abstolP } { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); LVARRAY_ERROR_IF_GE( rangeMin, rangeMax ); } @@ -65,11 +68,12 @@ struct EigenDecompositionOptions DenseInt const indexMaxP, double const abstolP ): type{ typeP }, - range{ Range::IN_INTERVAL }, + range{ IN_INTERVAL }, indexMin{ indexMinP }, indexMax{ indexMaxP }, abstol{ abstolP } { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); LVARRAY_ERROR_IF_LT( indexMin, 1 ); LVARRAY_ERROR_IF_GT( indexMin, indexMax ); } @@ -82,7 +86,7 @@ struct EigenDecompositionOptions static constexpr char const * const eigenvalueString = "N"; static constexpr char const * const eigenvectorString = "V"; - return type == Type::EIGENVALUES ? eigenvalueString : eigenvectorString; + return type == EIGENVALUES ? eigenvalueString : eigenvectorString; } /** @@ -94,17 +98,17 @@ struct EigenDecompositionOptions static constexpr char const * const intervalString = "V"; static constexpr char const * const indexString = "I"; - if( range == Range::ALL ) + if( range == ALL ) { return allString; } - return range == Range::IN_INTERVAL ? intervalString : indexString; + return range == IN_INTERVAL ? 
intervalString : indexString; } /// Type const type; /// - Range const range = Range::ALL; + Range const range = ALL; /// double const rangeMin = std::numeric_limits< double >::max(); @@ -128,7 +132,7 @@ struct EigenDecompositionOptions */ template< typename T > DenseInt heevr( - MemorySpace const space, + BuiltInBackends const backend, EigenDecompositionOptions const decompositionOptions, Matrix< std::complex< T > > const & A, Vector< T > const & eigenValues, @@ -140,9 +144,9 @@ DenseInt heevr( /** * */ -template< typename T, int USD, typename INDEX_TYPE > +template< typename BACK_END, typename T, int USD, typename INDEX_TYPE > DenseInt heevr( - MemorySpace const space, + BACK_END && backend, EigenDecompositionOptions const decompositionOptions, ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A, ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, @@ -157,7 +161,7 @@ DenseInt heevr( Vector< DenseInt > supportVector( support ); return heevr( - space, + std::forward< BACK_END >( backend ), decompositionOptions, AMatrix, eigenValuesVector, @@ -170,9 +174,9 @@ DenseInt heevr( /** * */ -template< typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +template< typename BACK_END, typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > DenseInt heevr( - MemorySpace const space, + BACK_END && backend, EigenDecompositionOptions const decompositionOptions, ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A, ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues, @@ -182,13 +186,14 @@ DenseInt heevr( SymmetricMatrixStorageType const storageType ) { // Unclear about the touch here since half of A is destroyed, maybe it's not necessary. + MemorySpace const space = getSpaceForBackend( backend ); A.move( space, true ); eigenValues.move( space, true ); eigenVectors.move( space, true ); support.move( space, true ); return heevr( - space, + std::forward< BACK_END >( backend ), decompositionOptions, A.toSlice(), eigenValues.toSlice(), diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp index 8f1c1a2b..9234362c 100644 --- a/unitTests/dense/testEigenDecomposition.cpp +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -15,39 +15,103 @@ namespace LvArray namespace testing { +using namespace dense; + template< typename T > using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; template< typename T, typename PERM > using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; +template< typename T > +struct HEEVR_TEST +{ + HEEVR_TEST( BuiltInBackends const backend ): + m_backend( backend ) + {} + + void threeByThreeEigenvalues() + { + resize( 3, 3, 0 ); + + m_matrix( 1, 1 ) = 2; + m_matrix( 0, 0 ) = 3; + m_matrix( 2, 2 ) = -4; + + SymmetricMatrixStorageType storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR; + + heevr( + m_backend, + EigenDecompositionOptions( EigenDecompositionOptions::EIGENVALUES ), + m_matrix.toView(), + m_eigenvalues.toView(), + m_eigenvectors.toView(), + m_support, + m_workspace, + storageType ); + + EXPECT_DOUBLE_EQ( m_eigenvalues[ 0 ], -4 ); + EXPECT_DOUBLE_EQ( m_eigenvalues[ 1 ], 2 ); + EXPECT_DOUBLE_EQ( m_eigenvalues[ 2 ], 3 ); + } + +private: + void resize( DenseInt const n, DenseInt const nvals, DenseInt const nvec ) + { + m_matrix.resize( n, n ); + m_eigenvalues.resize( nvals ); + m_eigenvectors.resize( n, nvec );; + m_support.resize( 2 * n ); + } + + BuiltInBackends const m_backend; + Array2d< 
std::complex< T >, RAJA::PERM_JI > m_matrix; + Array1d< T > m_eigenvalues; + Array2d< std::complex< T >, RAJA::PERM_JI > m_eigenvectors; + Array1d< int > m_support; + ArrayWorkspace< std::complex< T >, ChaiBuffer > m_workspace; +}; + +TEST( eigenvalues_float, lapack ) +{ + HEEVR_TEST< float > test( BuiltInBackends::LAPACK ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, lapack ) +{ + HEEVR_TEST< double > test( BuiltInBackends::LAPACK ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_float, magma ) +{ + HEEVR_TEST< float > test( BuiltInBackends::MAGMA ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, magma ) +{ + HEEVR_TEST< double > test( BuiltInBackends::MAGMA ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_float, magma_gpu ) +{ + HEEVR_TEST< float > test( BuiltInBackends::MAGMA_GPU ); + + test.threeByThreeEigenvalues(); +} + +TEST( eigenvalues_double, magma_gpu ) +{ + HEEVR_TEST< double > test( BuiltInBackends::MAGMA_GPU ); -TEST( heevr, allEigenvalues ) -{ - Array2d< std::complex< double >, RAJA::PERM_JI > matrix( 3, 3 ); - matrix( 1, 1 ) = 2; - matrix( 0, 0 ) = 3; - matrix( 2, 2 ) = -4; - - Array1d< double > eigenvalues( 3 ); - Array2d< std::complex< double >, RAJA::PERM_JI > eigenvectors; - Array1d< int > support( 6 ); - dense::ArrayWorkspace< std::complex< double >, ChaiBuffer > workspace; - dense::SymmetricMatrixStorageType storageType = dense::SymmetricMatrixStorageType::UPPER_TRIANGULAR; - - dense::heevr< double >( - MemorySpace::host, - dense::EigenDecompositionOptions( dense::EigenDecompositionOptions::Type::EIGENVALUES ), - matrix.toView(), - eigenvalues.toView(), - eigenvectors.toView(), - support, - workspace, - storageType ); - - EXPECT_DOUBLE_EQ( eigenvalues[ 0 ], -4 ); - EXPECT_DOUBLE_EQ( eigenvalues[ 1 ], 2 ); - EXPECT_DOUBLE_EQ( eigenvalues[ 2 ], 3 ); + test.threeByThreeEigenvalues(); } } // namespace testing From 5ada3d5c13ddb8c447ab1f1aa38d1a575e61c556 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 6 Sep 2022 15:06:56 -0700 Subject: [PATCH 32/34] Got linear solve and eigenvalues working. 
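
The gesv wrapper added in src/dense/linearSolve.{hpp,cpp} solves A X = B
through LAPACK or MAGMA: A is overwritten with its LU factors, B with the
solution X, and pivots receives the row permutation. A minimal usage
sketch, mirroring unitTests/dense/testLinearSolve.cpp (illustrative only:
it assumes a LAPACK-enabled build, uses ChaiBuffer like the unit tests do,
and the 2x2 values are made up):

    #include "dense/linearSolve.hpp"

    using namespace LvArray;

    // Column-major 2x2 system A x = b with A = [[2, 1], [1, 3]] and b = [3, 5].
    Array< double, 2, RAJA::PERM_JI, dense::DenseInt, ChaiBuffer > A( 2, 2 );
    Array< double, 2, RAJA::PERM_JI, dense::DenseInt, ChaiBuffer > B( 2, 1 );
    Array< dense::DenseInt, 1, RAJA::PERM_I, dense::DenseInt, ChaiBuffer > pivots( 2 );

    A( 0, 0 ) = 2; A( 0, 1 ) = 1;
    A( 1, 0 ) = 1; A( 1, 1 ) = 3;
    B( 0, 0 ) = 3; B( 1, 0 ) = 5;

    // On return A holds the LU factors and B the solution x = { 0.8, 1.4 }.
    dense::gesv( dense::BuiltInBackends::LAPACK, A.toView(), B.toView(), pivots );

With MAGMA_GPU the call looks the same, except the ArrayView overload
moves A and B to device memory and keeps pivots on the host, per the
@note in linearSolve.hpp.
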
--- src/dense/CMakeLists.txt | 3 + src/dense/backendHelpers.hpp | 12 + src/dense/common.hpp | 57 ++++- src/dense/eigenDecomposition.cpp | 72 +++--- src/dense/eigenDecomposition.hpp | 33 ++- src/dense/linearSolve.cpp | 278 +++++++++++++++++++++ src/dense/linearSolve.hpp | 85 +++++++ unitTests/dense/CMakeLists.txt | 1 + unitTests/dense/testEigenDecomposition.cpp | 34 ++- unitTests/dense/testLinearSolve.cpp | 198 +++++++++++++++ 10 files changed, 715 insertions(+), 58 deletions(-) create mode 100644 src/dense/backendHelpers.hpp create mode 100644 src/dense/linearSolve.cpp create mode 100644 src/dense/linearSolve.hpp create mode 100644 unitTests/dense/testLinearSolve.cpp diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt index 3d7a1f34..0a1de30b 100644 --- a/src/dense/CMakeLists.txt +++ b/src/dense/CMakeLists.txt @@ -1,11 +1,14 @@ set( lvarraydense_headers common.hpp + backendHelpers.hpp eigenDecomposition.hpp + linearSolve.hpp ) set( lvarraydense_sources common.cpp eigenDecomposition.cpp + linearSolve.cpp ) set( dependencies lvarray ${lvarray_dependencies} blas lapack ) diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp new file mode 100644 index 00000000..144ad845 --- /dev/null +++ b/src/dense/backendHelpers.hpp @@ -0,0 +1,12 @@ +#pragma once + +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + +/// This macro provide a flexible interface for Fortran naming convention for compiled objects +// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE +#define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name +// #else +// #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _ +// #endif \ No newline at end of file diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 09ef3edd..9c4fda87 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -52,6 +52,17 @@ char const * getOption( SymmetricMatrixStorageType const option ); template< typename T > using RealVersion = typename internal::RealVersion< T >::Type; +/** + * + */ +template< typename T > +static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value; + +/** + * + */ +template< typename T, typename U > +static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value; /** * @@ -86,10 +97,22 @@ struct Matrix */ template< typename INDEX_TYPE > Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - nCols{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + nRows{ integerConversion< DenseInt >( slice.size( 0 ) ) }, + nCols{ integerConversion< DenseInt >( slice.size( 1 ) ) }, stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, - columnMajor{ true }, + isColumnMajor{ true }, + data{ slice.data() } + {} + + /** + * + */ + template< typename INDEX_TYPE, int USD > + Matrix( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, + nCols{ integerConversion< DenseInt >( 1 ) }, + stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + isColumnMajor{ true }, data{ slice.data() } {} @@ -100,7 +123,7 @@ struct Matrix nRows{ 1 }, nCols{ 1 }, stride{ 1 }, - columnMajor{ true }, + isColumnMajor{ true }, data{ &value } {} @@ -115,7 +138,7 @@ struct Matrix DenseInt const nRows; DenseInt const nCols; DenseInt const stride; - bool const columnMajor; + bool const isColumnMajor; T * const data; }; @@ -180,7 +203,13 @@ template< typename T, template< typename > class BUFFER_TYPE > struct ArrayWorkspace : public Workspace< T > { ArrayWorkspace() 
- {} + { + m_work.setName( "ArrayWorkspace::m_work" ); + m_work2.setName( "ArrayWorkspace::m_work2" ); + m_work3.setName( "ArrayWorkspace::m_work3" ); + m_rwork.setName( "ArrayWorkspace::m_rwork" ); + m_iwork.setName( "ArrayWorkspace::m_iwork" ); + } virtual Vector< T > work() override { return m_work.toSlice(); } @@ -198,16 +227,24 @@ struct ArrayWorkspace : public Workspace< T > { return m_iwork.toSlice(); } virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override - { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override - { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override - { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override - { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } + { + m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override { diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index 071d5996..e70b6561 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -1,21 +1,11 @@ #include "eigenDecomposition.hpp" - -#if defined( LVARRAY_USE_MAGMA ) - #include -#endif - -/// This macro provide a flexible interface for Fortran naming convention for compiled objects -// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE -#define FORTRAN_MANGLE( name ) name -// #else -// #define FORTRAN_MANGLE( name ) name ## _ -// #endif +#include "backendHelpers.hpp" extern "C" { //////////////////////////////////////////////////////////////////////////////////////////////////// -#define LVARRAY_CHEEVR FORTRAN_MANGLE( cheevr ) +#define LVARRAY_CHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( cheevr ) void LVARRAY_CHEEVR( char const * JOBZ, char const * RANGE, @@ -42,7 +32,7 @@ void LVARRAY_CHEEVR( LvArray::dense::DenseInt * INFO ); //////////////////////////////////////////////////////////////////////////////////////////////////// -#define LVARRAY_ZHEEVR FORTRAN_MANGLE( zheevr ) +#define LVARRAY_ZHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( zheevr ) void LVARRAY_ZHEEVR( char const * JOBZ, char const * RANGE, @@ -260,18 +250,12 @@ DenseInt heevr( else if( backend == BuiltInBackends::MAGMA_GPU ) { int LDWA = N; - int LDWZ = 1; + int LDWZ = N; if( compute ) { - workspace.resizeWork2( MemorySpace::cuda, LDWA * N ); - - if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) - { - LDWZ = N; - } - - workspace.resizeWork3( MemorySpace::cuda, LDWZ * maxEigenvaluesToFind ); + workspace.resizeWork2( MemorySpace::host, LDWA * N ); + workspace.resizeWork3( MemorySpace::host, LDWZ * maxEigenvaluesToFind ); } if( std::is_same< T, float >::value ) @@ -307,7 +291,34 @@ DenseInt heevr( } else { - LVARRAY_ERROR( "Not supported." 
); + magma_zheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaDoubleComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); } } #endif @@ -337,8 +348,8 @@ DenseInt heevr( { // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. // I'm not sure exactly how this would work for the eigenvectors though. - LVARRAY_ERROR_IF( !A.columnMajor, "Row major is not yet supported." ); - LVARRAY_ERROR_IF( !eigenvectors.columnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !A.isColumnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor, "Row major is not yet supported." ); bool const reallocateWork = workspace.work().size < 2 * A.nRows; bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; @@ -349,24 +360,19 @@ DenseInt heevr( OptimalSizeCalculation< std::complex< T > > optimalSizes; internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); - MemorySpace const space = getSpaceForBackend( backend ); - if( reallocateWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalWorkSize() ); - workspace.resizeWork( space, optimalSizes.optimalWorkSize() ); + workspace.resizeWork( MemorySpace::host, optimalSizes.optimalWorkSize() ); } if( reallocateRWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalRWorkSize() ); - workspace.resizeRWork( space, optimalSizes.optimalRWorkSize() ); + workspace.resizeRWork( MemorySpace::host, optimalSizes.optimalRWorkSize() ); } if( reallocateIWork ) { - LVARRAY_LOG_VAR( optimalSizes.optimalIWorkSize() ); - workspace.resizeIWork( space, optimalSizes.optimalIWorkSize() ); + workspace.resizeIWork( MemorySpace::host, optimalSizes.optimalIWorkSize() ); } } diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp index e83305fa..5e7f3819 100644 --- a/src/dense/eigenDecomposition.hpp +++ b/src/dense/eigenDecomposition.hpp @@ -144,13 +144,13 @@ DenseInt heevr( /** * */ -template< typename BACK_END, typename T, int USD, typename INDEX_TYPE > +template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE > DenseInt heevr( BACK_END && backend, EigenDecompositionOptions const decompositionOptions, - ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & A, + ArraySlice< std::complex< T >, 2, USD_A, INDEX_TYPE > const & A, ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues, - ArraySlice< std::complex< T >, 2, USD, INDEX_TYPE > const & eigenVectors, + ArraySlice< std::complex< T >, 2, USD_V, INDEX_TYPE > const & eigenVectors, ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) @@ -174,23 +174,36 @@ DenseInt heevr( /** * */ -template< typename BACK_END, typename T, int USD, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE, 
template< typename > class BUFFER_TYPE > DenseInt heevr( BACK_END && backend, EigenDecompositionOptions const decompositionOptions, - ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & A, + ArrayView< std::complex< T >, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A, ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues, - ArrayView< std::complex< T >, 2, USD, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors, + ArrayView< std::complex< T >, 2, USD_V, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors, ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support, Workspace< std::complex< T > > & workspace, SymmetricMatrixStorageType const storageType ) { - // Unclear about the touch here since half of A is destroyed, maybe it's not necessary. MemorySpace const space = getSpaceForBackend( backend ); - A.move( space, true ); - eigenValues.move( space, true ); + + // The A matrix isn't touched because it is destroyed. + A.move( space, false ); eigenVectors.move( space, true ); - support.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the eigenvalues and support on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + eigenValues.move( MemorySpace::host, true ); + support.move( MemorySpace::host, true ); + } + else +#endif + { + eigenValues.move( space, true ); + support.move( space, true ); + } return heevr( std::forward< BACK_END >( backend ), diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp new file mode 100644 index 00000000..9833710f --- /dev/null +++ b/src/dense/linearSolve.cpp @@ -0,0 +1,278 @@ +#include "linearSolve.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + float * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + float * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + double * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + double * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< float > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< double > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +} // extern "C" + 
+namespace LvArray +{ +namespace dense +{ + +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ) +{ + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); + LVARRAY_ERROR_IF( !A.isColumnMajor, "The matrix A must be column major." ); + + LVARRAY_ERROR_IF_NE( A.nRows, B.nRows ); + LVARRAY_ERROR_IF( !B.isColumnMajor, "The matrix B must be column major." ); + + LVARRAY_ERROR_IF_NE( pivots.size, A.nRows ); + + DenseInt const N = A.nCols; + DenseInt const NRHS = B.nCols; + DenseInt const LDA = A.stride; + DenseInt const LDB = B.stride; + DenseInt INFO = 0; + + if( backend == BuiltInBackends::LAPACK ) + { + if( std::is_same< T, float >::value ) + { + LVARRAY_SGESV( + &N, + &NRHS, + reinterpret_cast< float * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + &LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + LVARRAY_DGESV( + &N, + &NRHS, + reinterpret_cast< double * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + LVARRAY_CGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< float > * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + LVARRAY_ZGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< double > * >( B.data ), + &LDB, + &INFO ); + } + } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv_gpu( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv_gpu( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF( INFO < 0, 
"The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< float >( + BuiltInBackends const backend, + Matrix< float > const & A, + Matrix< float > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< double >( + BuiltInBackends const backend, + Matrix< double > const & A, + Matrix< double > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< std::complex< float > >( + BuiltInBackends const backend, + Matrix< std::complex< float > > const & A, + Matrix< std::complex< float > > const & B, + Vector< DenseInt > const & pivots ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template void gesv< std::complex< double > >( + BuiltInBackends const backend, + Matrix< std::complex< double > > const & A, + Matrix< std::complex< double > > const & B, + Vector< DenseInt > const & pivots ); + + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/linearSolve.hpp b/src/dense/linearSolve.hpp new file mode 100644 index 00000000..3efe7719 --- /dev/null +++ b/src/dense/linearSolve.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * @brief Solves the matrix equation A X = B for X using (s, d, c, z)gesv. + * + * @tparam T The type of values in the matrices. Must be one of float, double, std::complex< float >, or std::complex< double >. + * @param backend The built in backend that implements (s, d, c, z)gesv. + * @param A The input matrix, which is overwritten with L and U from the LU decomposition. + * @param B The input right hand side, is overwritten with the solution X. + * @param pivots The permutation matrix used when factoring A. + * + * @note When using @c MAGMA_GPU as the backend both @param A and @param B should be on the GPU while @param pivots + * remains on the host. 
+ */ +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ); + +/** + * + */ +template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE > +void gesv( + BACK_END && backend, + ArraySlice< T, 2, USD_A, INDEX_TYPE > const & A, + ArraySlice< T, NDIM_B, USD_B, INDEX_TYPE > const & B, + ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & pivots ) +{ + Matrix< T > AMatrix( A ); + Matrix< T > BMatrix( B ); + Vector< DenseInt > pivotsVector( pivots ); + + gesv( + std::forward< BACK_END >( backend ), + AMatrix, + BMatrix, + pivots ); +} + +/** + * + */ +template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +void gesv( + BACK_END && backend, + ArrayView< T, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A, + ArrayView< T, NDIM_B, USD_B, INDEX_TYPE, BUFFER_TYPE > const & B, + ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & pivots ) +{ + // TODO(corbett5): Unclear about the touch here since A is destroyed but the LU decomposition may still be useful. + MemorySpace const space = getSpaceForBackend( backend ); + A.move( space, true ); + B.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the pivots on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + pivots.move( MemorySpace::host, true ); + } + else +#endif + { + pivots.move( space, true ); + } + + return gesv( + std::forward< BACK_END >( backend ), + A.toSlice(), + B.toSlice(), + pivots.toSlice() ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt index f324797e..4d58286d 100644 --- a/unitTests/dense/CMakeLists.txt +++ b/unitTests/dense/CMakeLists.txt @@ -10,6 +10,7 @@ # set( testSources testEigenDecomposition.cpp + testLinearSolve.cpp ) # diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp index 9234362c..bc29aa05 100644 --- a/unitTests/dense/testEigenDecomposition.cpp +++ b/unitTests/dense/testEigenDecomposition.cpp @@ -10,6 +10,10 @@ #include "../testUtils.hpp" +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + namespace LvArray { namespace testing @@ -18,21 +22,28 @@ namespace testing using namespace dense; template< typename T > -using Array1d = Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, DEFAULT_BUFFER >; +using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >; template< typename T, typename PERM > -using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >; +using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >; + +// TODO(corbett5): significantly improve this test. 
template< typename T > struct HEEVR_TEST { HEEVR_TEST( BuiltInBackends const backend ): m_backend( backend ) - {} + { + m_matrix.setName( "matrix" ); + m_eigenvalues.setName( "m_eigenvalues" ); + m_eigenvectors.setName( "eigenvectors" ); + m_support.setName( "support" ); + } void threeByThreeEigenvalues() { - resize( 3, 3, 0 ); + resize( 20, 20, 0 ); m_matrix( 1, 1 ) = 2; m_matrix( 0, 0 ) = 3; @@ -60,7 +71,7 @@ struct HEEVR_TEST { m_matrix.resize( n, n ); m_eigenvalues.resize( nvals ); - m_eigenvectors.resize( n, nvec );; + m_eigenvectors.resize( n, nvec ); m_support.resize( 2 * n ); } @@ -86,6 +97,8 @@ TEST( eigenvalues_double, lapack ) test.threeByThreeEigenvalues(); } +#if defined( LVARRAY_USE_MAGMA ) + TEST( eigenvalues_float, magma ) { HEEVR_TEST< float > test( BuiltInBackends::MAGMA ); @@ -114,13 +127,24 @@ TEST( eigenvalues_double, magma_gpu ) test.threeByThreeEigenvalues(); } +#endif + } // namespace testing } // namespace LvArray // This is the default gtest main method. It is included for ease of debugging. int main( int argc, char * * argv ) { +#if defined( LVARRAY_USE_MAGMA ) + magma_init(); +#endif + ::testing::InitGoogleTest( &argc, argv ); int const result = RUN_ALL_TESTS(); + +#if defined( LVARRAY_USE_MAGMA ) + magma_finalize(); +#endif + return result; } diff --git a/unitTests/dense/testLinearSolve.cpp b/unitTests/dense/testLinearSolve.cpp new file mode 100644 index 00000000..7a1ab3c8 --- /dev/null +++ b/unitTests/dense/testLinearSolve.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +// Source includes +#include "dense/linearSolve.hpp" + +#include "../testUtils.hpp" + +#include "output.hpp" + +#if defined( LVARRAY_USE_MAGMA ) + #include +#endif + +#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \ + EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \ + EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError ) + +namespace LvArray +{ +namespace testing +{ + +using namespace dense; + +template< typename T > +using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >; + +template< typename T, typename PERM > +using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >; + + +template< typename T > +struct GESV_Test : public ::testing::Test +{ + void test( BuiltInBackends const backend, DenseInt const N, DenseInt const nrhs ) + { + Array2d< T, RAJA::PERM_JI > A( N, N ); + Array2d< T, RAJA::PERM_JI > B( N, nrhs ) ; + Array1d< DenseInt > pivots( N ); + + for( DenseInt row = 0; row < N; ++row ) + { + for( DenseInt col = 0; col < N; ++col ) + { + A( row, col ) = randomNumber(); + } + + for( DenseInt col = 0; col < nrhs; ++col ) + { + B( row, col ) = randomNumber(); + } + } + + Array2d< T, RAJA::PERM_JI > ACopy( A ); + Array2d< T, RAJA::PERM_JI > X( B ); + gesv( backend, ACopy.toView(), X.toView(), pivots ); + + // TODO(corbett5): replace this with matrix matrix multiplication + X.move( MemorySpace::host, true ); + for( DenseInt i = 0; i < N; ++i ) + { + for( DenseInt j = 0; j < nrhs; ++j ) + { + T dot = 0; + for( DenseInt k = 0; k < N; ++k ) + { + dot += A( i, k ) * X( k, j ); + } + + EXPECT_COMPLEX_NEAR( dot, B( i, j ), 10 * N * std::numeric_limits< RealVersion< T > >::epsilon() ); + } + } + } + +private: + + template< typename _T=T > + std::enable_if_t< !IsComplex< _T >, T > + randomNumber() + { return m_dist( m_gen ); } + + template< typename _T=T > + std::enable_if_t< 
+  randomNumber()
+  { return { m_dist( m_gen ), m_dist( m_gen ) }; }
+
+  std::mt19937_64 m_gen;
+  std::uniform_real_distribution< RealVersion< T > > m_dist;
+};
+
+using GESV_Test_types = ::testing::Types<
+  float,
+  double,
+  std::complex< float >,
+  std::complex< double >
+  >;
+TYPED_TEST_SUITE( GESV_Test, GESV_Test_types, );
+
+TYPED_TEST( GESV_Test, LAPACK_2x2 )
+{
+  this->test( BuiltInBackends::LAPACK, 2, 1 );
+  this->test( BuiltInBackends::LAPACK, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_10x10 )
+{
+  this->test( BuiltInBackends::LAPACK, 10, 1 );
+  this->test( BuiltInBackends::LAPACK, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_100x100 )
+{
+  this->test( BuiltInBackends::LAPACK, 100, 1 );
+  this->test( BuiltInBackends::LAPACK, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_1000x1000 )
+{
+  this->test( BuiltInBackends::LAPACK, 1000, 1 );
+  this->test( BuiltInBackends::LAPACK, 1000, 10 );
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TYPED_TEST( GESV_Test, MAGMA_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA, 2, 1 );
+  this->test( BuiltInBackends::MAGMA, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA, 10, 1 );
+  this->test( BuiltInBackends::MAGMA, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA, 100, 1 );
+  this->test( BuiltInBackends::MAGMA, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA, 1000, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 10 );
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv ) +{ +#if defined( LVARRAY_USE_MAGMA ) + magma_init(); +#endif + + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + +#if defined( LVARRAY_USE_MAGMA ) + magma_finalize(); +#endif + + return result; +} From 3165719492cee7cd653e9444fbfd0d229edd3ca0 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 6 Sep 2022 22:02:54 -0700 Subject: [PATCH 33/34] Squash --- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 10 +++++----- .../spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b8353dd0..652d26c4 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -1,11 +1,11 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@upstream-2019.03.19 paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + cc: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang + cxx: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r flags: cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index 265a6c5f..575d66db 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -3,14 +3,20 @@ packages: target: [default] compiler: [gcc, clang, xl] providers: - blas: [netlib-lapack] - lapack: [netlib-lapack] + blas: [netlib-lapack, essl] + lapack: [netlib-lapack, essl] netlib-lapack: buildable: False externals: - spec: netlib-lapack@3.10.0 ~external-blas prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ + + essl: + buildable: False + externals: + - spec: essl@6.2.1 ~ilp64 threads=openmp +cuda +lapack + prefix: /usr/tcetmp/packages/essl/essl-6.2.1/ cuda: buildable: False From 3725d1a70d8b243d2682b5b6aa9a4d1c44eff3e5 Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Tue, 9 May 2023 23:47:14 -0700 Subject: [PATCH 34/34] Started testing gemm, need to work out some stuff. 
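
Adds a BlasLapackInterface with gemm and gesv that dispatches to the Fortran
BLAS/LAPACK symbols through a small TypeDispatch helper, reworks dense::Matrix
to carry explicit sizes and strides, and adds a dense::gemm wrapper that
normalizes row-major operands by transposing them and flipping the requested
operation. testgemm.cpp checks C = alpha * op(A) * op(B) + beta * C against a
naive triple loop for float, double and their complex counterparts.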
--- scripts/uberenv/packages/lvarray/package.py | 6 +- .../toss_4_x86_64_ib/packages.yaml | 10 +- src/dense/BlasLapackInterface.cpp | 210 ++++++++++++++++++ src/dense/BlasLapackInterface.hpp | 31 +++ src/dense/CMakeLists.txt | 6 +- src/dense/backendHelpers.hpp | 78 ++++++- src/dense/common.cpp | 26 +-- src/dense/common.hpp | 190 ++++++++-------- src/dense/dense.hpp | 52 +++++ src/dense/eigenDecomposition.cpp | 20 +- src/dense/linearSolve.cpp | 16 +- unitTests/dense/CMakeLists.txt | 5 +- unitTests/dense/testgemm.cpp | 199 +++++++++++++++++ 13 files changed, 695 insertions(+), 154 deletions(-) create mode 100644 src/dense/BlasLapackInterface.cpp create mode 100644 src/dense/BlasLapackInterface.hpp create mode 100644 src/dense/dense.hpp create mode 100644 unitTests/dense/testgemm.cpp diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 7fc306fd..df299f3e 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -65,15 +65,10 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') -<<<<<<< HEAD variant('tpl_build_type', default='none', description='TPL build type', values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) - - # conflicts('~lapack', when='+magma') -======= conflicts('~lapack', when='+magma') ->>>>>>> cde43f2 (Building and compiling with MAGMA. GPU not yet working, think it's something to do with the new workspaces.) depends_on('blt@0.4.1:', when='@0.2.0:', type='build') @@ -114,6 +109,7 @@ class Lvarray(CMakePackage, CudaPackage): depends_on('umpire build_type={}'.format(bt)) depends_on('chai build_type={}'.format(bt), when='+chai') depends_on('caliper build_type={}'.format(bt), when='+caliper') + depends_on('magma build_type={}'.format(bt), when='+magma') phases = ['hostconfig', 'cmake', 'build', 'install'] diff --git a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml index ea2998fc..d3d2714a 100644 --- a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml @@ -3,14 +3,14 @@ packages: target: [default] compiler: [gcc, clang, intel] providers: - blas: [intel-mkl] - lapack: [intel-mkl] + blas: [intel-oneapi-mkl] + lapack: [intel-oneapi-mkl] - intel-mkl: + intel-oneapi-mkl: buildable: False externals: - - spec: intel-mkl@2020.0.166 threads=openmp - prefix: /usr/tce/packages/mkl/mkl-2020.0/ + - spec: intel-oneapi-mkl@2022.1.0 + prefix: /usr/tce/backend/installations/linux-rhel8-x86_64/intel-19.0.4/intel-oneapi-mkl-2022.1.0-sksz67twjxftvwchnagedk36gf7plkrp/ cmake: buildable: False diff --git a/src/dense/BlasLapackInterface.cpp b/src/dense/BlasLapackInterface.cpp new file mode 100644 index 00000000..ca4309c5 --- /dev/null +++ b/src/dense/BlasLapackInterface.cpp @@ -0,0 +1,210 @@ +#include "BlasLapackInterface.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( sgemm ) +void LVARRAY_SGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + float const * ALPHA, + float const * A, + int const * LDA, + float const * B, + int const * LDB, + float const * BETA, + float * C, + int const * LDC ); + 
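+// All of these Fortran entry points take their arguments by pointer; matrices are
+// stored column major and each leading dimension must be at least the number of
+// rows of the stored matrix (and at least one).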
+//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( dgemm ) +void LVARRAY_DGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + double const * ALPHA, + double const * A, + int const * LDA, + double const * B, + int const * LDB, + double const * BETA, + double * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( cgemm ) +void LVARRAY_CGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< float > const * ALPHA, + std::complex< float > const * A, + int const * LDA, + std::complex< float > const * B, + int const * LDB, + std::complex< float > const * BETA, + std::complex< float > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( zgemm ) +void LVARRAY_ZGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< double > const * ALPHA, + std::complex< double > const * A, + int const * LDA, + std::complex< double > const * B, + int const * LDB, + std::complex< double > const * BETA, + std::complex< double > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + int const * N, + int const * NRHS, + float * A, + int const * LDA, + int * IPIV, + float * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + int const * N, + int const * NRHS, + double * A, + int const * LDA, + int * IPIV, + double * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + int const * N, + int const * NRHS, + std::complex< float > * A, + int const * LDA, + int * IPIV, + std::complex< float > * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + int const * N, + int const * NRHS, + std::complex< double > * A, + int const * LDA, + int * IPIV, + std::complex< double > * B, + int const * LDB, + int * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +char toLapackChar( Operation const op ) +{ + if( op == Operation::NO_OP ) return 'N'; + if( op == Operation::TRANSPOSE ) return 'T'; + if( op == Operation::ADJOINT ) return 'C'; + + LVARRAY_ERROR( "Unknown operation: " << int( op ) ); + return '\0'; +} + + +template< typename T > +void BlasLapackInterface< T >::gemm( + Operation opA, + Operation opB, + T const alpha, + Matrix< T const > const & A, + Matrix< T const > const & B, + T const beta, + Matrix< T > const & C ) +{ + char const TRANSA = toLapackChar( opA ); + char const TRANSB = toLapackChar( opB ); + int const M = C.sizes[ 0 ]; + int const N = C.sizes[ 1 ]; + int const K = opA == Operation::NO_OP 
? A.sizes[ 1 ] : A.sizes[ 0 ];
+  int const LDA = std::max( std::ptrdiff_t{ 1 }, A.strides[ 1 ] );
+  int const LDB = std::max( std::ptrdiff_t{ 1 }, B.strides[ 1 ] );
+  int const LDC = std::max( std::ptrdiff_t{ 1 }, C.strides[ 1 ] );
+
+  TypeDispatch< T >::dispatch( LVARRAY_SGEMM, LVARRAY_DGEMM, LVARRAY_CGEMM, LVARRAY_ZGEMM,
+    &TRANSA,
+    &TRANSB,
+    &M,
+    &N,
+    &K,
+    &alpha,
+    A.data,
+    &LDA,
+    B.data,
+    &LDB,
+    &beta,
+    C.data,
+    &LDC );
+}
+
+
+template< typename T >
+void BlasLapackInterface< T >::gesv(
+  Matrix< T > const & A,
+  Matrix< T > const & B,
+  Vector< int > const & pivots )
+{
+  int const N = A.sizes[ 0 ];
+  int const NRHS = B.sizes[ 1 ];
+  int const LDA = A.strides[ 1 ];
+  int const LDB = B.strides[ 1 ];
+  int INFO = 0;
+
+  TypeDispatch< T >::dispatch( LVARRAY_SGESV, LVARRAY_DGESV, LVARRAY_CGESV, LVARRAY_ZGESV,
+    &N,
+    &NRHS,
+    A.data,
+    &LDA,
+    pivots.data,
+    B.data,
+    &LDB,
+    &INFO );
+
+  LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." );
+  LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 <<
+                    " ) is exactly zero so the solution could not be computed." );
+}
+
+template class BlasLapackInterface< float >;
+template class BlasLapackInterface< double >;
+template class BlasLapackInterface< std::complex< float > >;
+template class BlasLapackInterface< std::complex< double > >;
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/BlasLapackInterface.hpp b/src/dense/BlasLapackInterface.hpp
new file mode 100644
index 00000000..ed747828
--- /dev/null
+++ b/src/dense/BlasLapackInterface.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct BlasLapackInterface
+{
+  static constexpr MemorySpace MEMORY_SPACE = MemorySpace::host;
+
+  static void gemm(
+    Operation opA,
+    Operation opB,
+    T const alpha,
+    Matrix< T const > const & A,
+    Matrix< T const > const & B,
+    T const beta,
+    Matrix< T > const & C );
+
+  static void gesv(
+    Matrix< T > const & A,
+    Matrix< T > const & B,
+    Vector< int > const & pivots );
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt
index 0a1de30b..8c7b4b0c 100644
--- a/src/dense/CMakeLists.txt
+++ b/src/dense/CMakeLists.txt
@@ -1,14 +1,12 @@
 set( lvarraydense_headers
      common.hpp
      backendHelpers.hpp
-     eigenDecomposition.hpp
-     linearSolve.hpp
+     BlasLapackInterface.hpp
      )
 
 set( lvarraydense_sources
      common.cpp
-     eigenDecomposition.cpp
-     linearSolve.cpp
+     BlasLapackInterface.cpp
      )
 
 set( dependencies lvarray ${lvarray_dependencies} blas lapack )
diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp
index 144ad845..5de71cf8 100644
--- a/src/dense/backendHelpers.hpp
+++ b/src/dense/backendHelpers.hpp
@@ -1,12 +1,82 @@
 #pragma once
 
-#if defined( LVARRAY_USE_MAGMA )
-  #include <magma.h>
-#endif
+#include <complex>
 
 /// This macro provide a flexible interface for Fortran naming convention for compiled objects
 // #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
 #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name
 // #else
 // #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _
-// #endif
\ No newline at end of file
+// #endif
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct TypeDispatch
+{};
+
+template<>
+struct TypeDispatch< float >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ...
ARGS > + static constexpr auto dispatch( + F_FLOAT && fFloat, + F_DOUBLE &&, + F_CFLOAT &&, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fFloat( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< double > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE && fDouble, + F_CFLOAT &&, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fDouble( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< std::complex< float > > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE &&, + F_CFLOAT && fCFloat, + F_CDOUBLE &&, + ARGS && ... args ) + { + return fCFloat( std::forward< ARGS >( args ) ... ); + } +}; + +template<> +struct TypeDispatch< std::complex< double > > +{ + template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS > + static constexpr auto dispatch( + F_FLOAT &&, + F_DOUBLE &&, + F_CFLOAT &&, + F_CDOUBLE && fCDouble, + ARGS && ... args ) + { + return fCDouble( std::forward< ARGS >( args ) ... ); + } +}; + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/common.cpp b/src/dense/common.cpp index 8843ca82..b1cab9fe 100644 --- a/src/dense/common.cpp +++ b/src/dense/common.cpp @@ -5,26 +5,16 @@ namespace LvArray namespace dense { -//////////////////////////////////////////////////////////////////////////////////////////////////// -char const * getOption( SymmetricMatrixStorageType const option ) +Operation transposeOp( Operation const op ) { - static constexpr char const * const upper = "U"; - static constexpr char const * const lower = "L"; + switch( op ) + { + case Operation::NO_OP: return Operation::TRANSPOSE; + case Operation::TRANSPOSE: return Operation::NO_OP; + case Operation::ADJOINT: LVARRAY_ERROR( "Not supported" ); + } - return option == SymmetricMatrixStorageType::UPPER_TRIANGULAR ? upper : lower; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -MemorySpace getSpaceForBackend( BuiltInBackends const backend ) -{ -#if defined( LVARRAY_USE_MAGMA ) - // TODO: This needs to be changed to MemorySpace::hip or whatever. 
- if( backend == BuiltInBackends::MAGMA_GPU ) return MemorySpace::cuda; -#else - LVARRAY_UNUSED_VARIABLE( backend ); -#endif - - return MemorySpace::host; + return Operation::NO_OP; } } // namespace dense diff --git a/src/dense/common.hpp b/src/dense/common.hpp index 9c4fda87..376b589c 100644 --- a/src/dense/common.hpp +++ b/src/dense/common.hpp @@ -41,10 +41,14 @@ enum class SymmetricMatrixStorageType LOWER_TRIANGULAR, }; -/** - * TODO: move to internal namespace - */ -char const * getOption( SymmetricMatrixStorageType const option ); +enum class Operation +{ + NO_OP, + TRANSPOSE, + ADJOINT, +}; + +Operation transposeOp( Operation const op ); /** * @@ -64,84 +68,76 @@ static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value; template< typename T, typename U > static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value; -/** - * - */ -enum class BuiltInBackends -{ - LAPACK, -#if defined( LVARRAY_USE_MAGMA ) - MAGMA, - MAGMA_GPU, -#endif -}; - -/** - * - */ -MemorySpace getSpaceForBackend( BuiltInBackends const backend ); - -/** - * - */ -using DenseInt = int; - /** * */ template< typename T > struct Matrix { - /** - * - */ - template< typename INDEX_TYPE > - Matrix( ArraySlice< T, 2, 0, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 0 ) ) }, - nCols{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - stride{ integerConversion< DenseInt >( slice.stride( 1 ) ) }, - isColumnMajor{ true }, - data{ slice.data() } - {} - - /** - * - */ - template< typename INDEX_TYPE, int USD > - Matrix( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - nRows{ integerConversion< DenseInt >( slice.size( 1 ) ) }, - nCols{ integerConversion< DenseInt >( 1 ) }, - stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, - isColumnMajor{ true }, - data{ slice.data() } - {} + Matrix( + typeManipulation::CArray< std::ptrdiff_t, 2 > const & sizesIn, + typeManipulation::CArray< std::ptrdiff_t, 2 > const & stridesIn, + T * const dataIn ): + sizes{ sizesIn }, + strides{ stridesIn }, + data{ dataIn } + { + LVARRAY_ERROR_IF_LT( sizes[ 0 ], 0 ); + LVARRAY_ERROR_IF_LT( sizes[ 1 ], 0 ); + LVARRAY_ERROR_IF_LT( strides[ 0 ], 0 ); + LVARRAY_ERROR_IF_LT( strides[ 1 ], 0 ); + } - /** - * - */ Matrix( T & value ): - nRows{ 1 }, - nCols{ 1 }, - stride{ 1 }, - isColumnMajor{ true }, + sizes{ 1, 1 }, + strides{ 1, 1 }, data{ &value } {} - /** - * - */ + Matrix( Matrix< std::remove_const_t< T > > const & src ): + sizes{ src.sizes }, + strides{ src.strides }, + data{ src.data } + {} + bool isSquare() const + { return sizes[0] == sizes[1]; } + + bool isColumnMajor() const + { return strides[ 0 ] == 1; } + + bool isRowMajor() const + { return strides[ 1 ] == 1; } + + bool isContiguous() const + { return isColumnMajor() || isRowMajor(); } + + std::ptrdiff_t nRows() const + { return sizes[ 0 ]; } + + std::ptrdiff_t nCols() const + { return sizes[ 1 ]; } + + Matrix transpose() const { - return nRows == nCols; + return Matrix( { sizes[ 1 ], sizes[ 0 ] }, { strides[ 1 ], strides[ 0 ] }, data ); } - DenseInt const nRows; - DenseInt const nCols; - DenseInt const stride; - bool const isColumnMajor; - T * const data; + typeManipulation::CArray< std::ptrdiff_t, 2 > sizes; + typeManipulation::CArray< std::ptrdiff_t, 2 > strides; + T * data; }; +template< typename T, typename PERM, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +Matrix< T > toMatrix( + Array< T, 2, PERM, INDEX_TYPE, BUFFER_TYPE > const & array, + MemorySpace const 
space, + bool const touch ) +{ + array.move( space, touch ); + return Matrix< T >( array.dimsArray(), array.stridesArray(), array.data() ); +} + /** * */ @@ -150,8 +146,8 @@ struct Vector { template< int USD, typename INDEX_TYPE > Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): - size{ integerConversion< DenseInt >( slice.size() ) }, - stride{ integerConversion< DenseInt >( slice.stride( 0 ) ) }, + size{ integerConversion< std::ptrdiff_t >( slice.size() ) }, + stride{ integerConversion< std::ptrdiff_t >( slice.stride( 0 ) ) }, data{ slice.data() } {} @@ -161,8 +157,8 @@ struct Vector data{ &value } {} - DenseInt const size; - DenseInt const stride; + std::ptrdiff_t const size; + std::ptrdiff_t const stride; T * const data; }; @@ -183,17 +179,17 @@ struct Workspace virtual Vector< RealVersion< T > > rwork() = 0; - virtual Vector< DenseInt > iwork() = 0; + virtual Vector< int > iwork() = 0; - virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; - virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) = 0; + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; }; /** @@ -223,44 +219,44 @@ struct ArrayWorkspace : public Workspace< T > virtual Vector< RealVersion< T > > rwork() override { return m_rwork.toSlice(); } - virtual Vector< DenseInt > iwork() override + virtual Vector< int > iwork() override { return m_iwork.toSlice(); } - virtual void resizeWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeWork2( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeWork3( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeRWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); } - virtual void resizeIWork( MemorySpace const space, DenseInt const newSize ) override + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) override { m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); } private: - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work; - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work2; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, 
BUFFER_TYPE > m_work2; - Array< T, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_work3; + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work3; - Array< RealVersion< T >, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_rwork; + Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_rwork; - Array< DenseInt, 1, RAJA::PERM_I, DenseInt, BUFFER_TYPE > m_iwork; + Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_iwork; }; /** @@ -287,28 +283,28 @@ struct OptimalSizeCalculation : public Workspace< T > virtual Vector< int > iwork() override { return m_iwork; } - virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); } - virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), DenseInt const LVARRAY_UNUSED_ARG( newSize ) ) override + virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." 
);
   }
 
-  DenseInt optimalWorkSize() const
-  { return static_cast< DenseInt >( m_work.real() ); }
+  std::ptrdiff_t optimalWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_work.real() ); }
 
-  DenseInt optimalRWorkSize() const
-  { return static_cast< DenseInt >( m_rwork ); }
+  std::ptrdiff_t optimalRWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_rwork ); }
 
-  DenseInt optimalIWorkSize() const
+  std::ptrdiff_t optimalIWorkSize() const
   { return m_iwork; }
 
 private:
@@ -320,7 +316,7 @@ struct OptimalSizeCalculation : public Workspace< T >
 
   RealVersion< T > m_rwork { -1 };
 
-  DenseInt m_iwork { -1 };
+  int m_iwork { -1 };
 };
 
 } // namespace dense
diff --git a/src/dense/dense.hpp b/src/dense/dense.hpp
new file mode 100644
index 00000000..2fcf202d
--- /dev/null
+++ b/src/dense/dense.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename INTERFACE, typename MATRIX_A, typename MATRIX_B, typename MATRIX_C, typename SCALAR >
+void gemm(
+  Operation opA,
+  Operation opB,
+  SCALAR const alpha,
+  MATRIX_A const & Ain,
+  MATRIX_B const & Bin,
+  SCALAR const beta,
+  MATRIX_C const & Cin )
+{
+  Matrix< SCALAR const > A = toMatrix( Ain, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR const > B = toMatrix( Bin, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR > const C = toMatrix( Cin, INTERFACE::MEMORY_SPACE, true );
+
+  // Check the sizes
+  LVARRAY_ERROR_IF_NE( C.sizes[ 0 ], A.sizes[ 0 + (opA != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( C.sizes[ 1 ], B.sizes[ 1 - (opB != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( A.sizes[ 1 - (opA != Operation::NO_OP) ],
+                       B.sizes[ 0 + (opB != Operation::NO_OP) ] );
+
+  // Check that everything is contiguous
+  LVARRAY_ERROR_IF( !A.isContiguous(), "Matrix A must have unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !B.isContiguous(), "Matrix B must have unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !C.isColumnMajor(), "Matrix C must be column major." );
+
+  // TODO(corbett5): Don't think this will work for Hermitian matrices.
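+  // A row-major matrix holds the same data as the transpose of a column-major one,
+  // so present it to the backend through transpose() and flip the requested
+  // operation with transposeOp(); op( A ) itself is unchanged.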
+ if( !A.isColumnMajor() ) + { + A = A.transpose(); + opA = transposeOp( opA ); + } + if( !B.isColumnMajor() ) + { + B = B.transpose(); + opB = transposeOp( opB ); + } + + INTERFACE::gemm( opA, opB, alpha, A, B, beta, C ); +} + + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp index e70b6561..68236057 100644 --- a/src/dense/eigenDecomposition.cpp +++ b/src/dense/eigenDecomposition.cpp @@ -90,8 +90,8 @@ DenseInt heevr( char const * const JOBZ = decompositionOptions.typeArg(); char const * const RANGE = decompositionOptions.rangeArg(); char const * const UPLO = getOption( storageType ); - DenseInt const N = A.nCols; - DenseInt const LDA = A.stride; + DenseInt const N = A.sizes[ 1 ]; + DenseInt const LDA = A.strides[ 1 ]; T const VL = decompositionOptions.rangeMin; T const VU = decompositionOptions.rangeMax; @@ -112,11 +112,11 @@ DenseInt heevr( if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS ) { - LVARRAY_ERROR_IF_NE( eigenvectors.nRows, N ); - LVARRAY_ERROR_IF_LT( eigenvectors.nCols, maxEigenvaluesToFind ); + LVARRAY_ERROR_IF_NE( eigenvectors.sizes[ 0 ], N ); + LVARRAY_ERROR_IF_LT( eigenvectors.sizes[ 1 ], maxEigenvaluesToFind ); } - DenseInt const LDZ = std::max( 1, eigenvectors.stride ); + DenseInt const LDZ = std::max( 1, eigenvectors.strides[ 1 ] ); if( decompositionOptions.range == EigenDecompositionOptions::ALL || ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX && @@ -348,12 +348,12 @@ DenseInt heevr( { // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. // I'm not sure exactly how this would work for the eigenvectors though. - LVARRAY_ERROR_IF( !A.isColumnMajor, "Row major is not yet supported." ); - LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor, "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor(), "Row major is not yet supported." ); - bool const reallocateWork = workspace.work().size < 2 * A.nRows; - bool const reallocateRWork = workspace.rwork().size < 24 * A.nRows; - bool const reallocateIWork = workspace.iwork().size < 10 * A.nRows; + bool const reallocateWork = workspace.work().size < 2 * A.sizes[ 0 ]; + bool const reallocateRWork = workspace.rwork().size < 24 * A.sizes[ 0 ]; + bool const reallocateIWork = workspace.iwork().size < 10 * A.sizes[ 0 ]; if( reallocateWork || reallocateRWork || reallocateIWork ) { diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp index 9833710f..33d5f503 100644 --- a/src/dense/linearSolve.cpp +++ b/src/dense/linearSolve.cpp @@ -67,17 +67,17 @@ void gesv( Vector< DenseInt > const & pivots ) { LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); - LVARRAY_ERROR_IF( !A.isColumnMajor, "The matrix A must be column major." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "The matrix A must be column major." ); - LVARRAY_ERROR_IF_NE( A.nRows, B.nRows ); - LVARRAY_ERROR_IF( !B.isColumnMajor, "The matrix B must be column major." ); + LVARRAY_ERROR_IF_NE( A.sizes[ 0 ], B.sizes[ 0 ] ); + LVARRAY_ERROR_IF( !B.isColumnMajor(), "The matrix B must be column major." 
);
 
-  LVARRAY_ERROR_IF_NE( pivots.size, A.nRows );
+  LVARRAY_ERROR_IF_NE( pivots.size, A.sizes[ 0 ] );
 
-  DenseInt const N = A.nCols;
-  DenseInt const NRHS = B.nCols;
-  DenseInt const LDA = A.stride;
-  DenseInt const LDB = B.stride;
+  DenseInt const N = A.sizes[ 1 ];
+  DenseInt const NRHS = B.sizes[ 1 ];
+  DenseInt const LDA = A.strides[ 1 ];
+  DenseInt const LDB = B.strides[ 1 ];
   DenseInt INFO = 0;
 
   if( backend == BuiltInBackends::LAPACK )
diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt
index 4d58286d..f87b2fda 100644
--- a/unitTests/dense/CMakeLists.txt
+++ b/unitTests/dense/CMakeLists.txt
@@ -9,9 +9,8 @@
 # Specify list of tests
 #
 set( testSources
-     testEigenDecomposition.cpp
-     testLinearSolve.cpp
-   )
+  testgemm.cpp
+)
 
 #
 # Add gtest C++ based tests
diff --git a/unitTests/dense/testgemm.cpp b/unitTests/dense/testgemm.cpp
new file mode 100644
index 00000000..51f50773
--- /dev/null
+++ b/unitTests/dense/testgemm.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/dense.hpp"
+#include "dense/BlasLapackInterface.hpp"
+
+#include "../testUtils.hpp"
+
+#include <limits>
+#include <random>
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+// This should probably go in a common place
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >;
+
+template< typename T >
+std::enable_if_t< std::is_floating_point< T >::value, T >
+randomValue( std::mt19937 & gen )
+{ return std::uniform_real_distribution< T >{ -1, 1 }( gen ); }
+
+template< typename T >
+std::enable_if_t< dense::IsComplex< T >, T >
+randomValue( std::mt19937 & gen )
+{
+  return { std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ),
+           std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ) };
+}
+
+template< typename T, typename PERM >
+Array2d< T, PERM > randomMatrix( std::ptrdiff_t const N, std::ptrdiff_t const M )
+{
+  std::mt19937 gen( std::random_device{}() );
+
+  Array2d< T, PERM > const ret( N, M );
+
+  for( std::ptrdiff_t r = 0; r < N; ++r )
+  {
+    for( std::ptrdiff_t c = 0; c < M; ++c )
+    {
+      ret( r, c ) = T{10} * randomValue< T >( gen );
+    }
+  }
+
+  return ret;
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< std::is_floating_point< T >::value >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< dense::IsComplex< T > >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_COMPLEX_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename INTERFACE, typename T, typename PERM_A, typename PERM_B >
+struct GemmTest
+{
+  void Rij_eq_AikBkj()
+  {
+    std::mt19937 gen( std::random_device{}() );
+
+    int const N = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    int const M = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    int const K = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+
+    T const alpha = T{10} * randomValue< T >( gen );
+    T const beta = T{10} * randomValue< T >( gen );
+
+    Array2d< T, PERM_A > const A = randomMatrix< T, PERM_A >( N, K );
+    Array2d< T, PERM_B > const B = randomMatrix< T, PERM_B >( K, M );
+    Array2d< T, RAJA::PERM_JI > const C = randomMatrix< T, RAJA::PERM_JI >( N, M );
+
+    Array2d< T, PERM_A > const Acopy = A;
+    Array2d< T, PERM_B > const Bcopy = B;
+    Array2d< T, RAJA::PERM_JI > const Ccopy = C;
+
+    dense::gemm< INTERFACE >( dense::Operation::NO_OP, dense::Operation::NO_OP, alpha, A, B, beta, C );
+
+    A.move( MemorySpace::host, false );
+    B.move( MemorySpace::host, false );
+    C.move( MemorySpace::host, false );
+
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < K; ++k )
+        {
+          dot += Acopy( i, k ) * Bcopy( k, j );
+        }
+
+        Ccopy( i, j ) = alpha * dot + beta * Ccopy( i, j );
+      }
+    }
+
+    checkEqual( A, Acopy, 0 );
+    checkEqual( B, Bcopy, 0 );
+    checkEqual( C, Ccopy, 1e3 * std::numeric_limits< dense::RealVersion< T > >::epsilon() );
+  }
+};
+
+TEST( LapackInterface_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
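
For reference, a minimal end-to-end sketch of the interface added in this series,
assuming the headers land as above; the MallocBuffer storage and the fill loops are
illustrative only, and the call dispatches to dgemm through BlasLapackInterface< double >:

#include "Array.hpp"
#include "MallocBuffer.hpp"
#include "dense/dense.hpp"
#include "dense/BlasLapackInterface.hpp"

int main()
{
  using namespace LvArray;

  // Column-major (RAJA::PERM_JI) arrays are what the LAPACK backend accepts directly;
  // a row-major A or B would be transpose-normalized inside dense::gemm.
  using ColMajor = Array< double, 2, RAJA::PERM_JI, std::ptrdiff_t, MallocBuffer >;

  ColMajor A( 3, 4 ), B( 4, 2 ), C( 3, 2 );
  for( std::ptrdiff_t i = 0; i < 3; ++i )
    for( std::ptrdiff_t j = 0; j < 4; ++j )
      A( i, j ) = 1.0 / ( i + j + 1 );

  for( std::ptrdiff_t i = 0; i < 4; ++i )
    for( std::ptrdiff_t j = 0; j < 2; ++j )
      B( i, j ) = ( i == j );

  // C = 1.0 * A * B + 0.0 * C; with beta == 0 the initial contents of C are ignored.
  dense::gemm< dense::BlasLapackInterface< double > >(
    dense::Operation::NO_OP, dense::Operation::NO_OP, 1.0, A, B, 0.0, C );

  return 0;
}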