diff --git a/cmake/ExternalGitTags.cmake b/cmake/ExternalGitTags.cmake index 1d4dbb245..81c2fca94 100644 --- a/cmake/ExternalGitTags.cmake +++ b/cmake/ExternalGitTags.cmake @@ -20,7 +20,7 @@ set(EXTERN_ARPACK_GIT_BRANCH "Git branch for external ARPACK-NG build" ) set(EXTERN_ARPACK_GIT_TAG - "0eaf38bd51efd3046f0a7e61a08ddcd7ed1f1fcc" CACHE STRING # 11/22/2023 + "51ce061ff7066ce712638bb058c805c40a326582" CACHE STRING "Git tag for external ARPACK-NG build" ) @@ -34,7 +34,7 @@ set(EXTERN_BUTTERFLYPACK_GIT_BRANCH "Git branch for external ButterflyPACK build" ) set(EXTERN_BUTTERFLYPACK_GIT_TAG - "e1ad6091e8dc2cb906ec426222f4acedb4eeeff2" CACHE STRING # 01/10/2024 + "e1ad6091e8dc2cb906ec426222f4acedb4eeeff2" CACHE STRING "Git tag for external ButterflyPACK build" ) @@ -48,7 +48,7 @@ set(EXTERN_GSLIB_GIT_BRANCH "Git branch for external GSLIB build" ) set(EXTERN_GSLIB_GIT_TAG - "39d1baae8f4bfebe3ebca6a234dcc8ba1ee5edc7" CACHE STRING # 11/09/2022 + "39d1baae8f4bfebe3ebca6a234dcc8ba1ee5edc7" CACHE STRING "Git tag for external GSLIB build" ) @@ -62,7 +62,7 @@ set(EXTERN_HYPRE_GIT_BRANCH "Git branch for external HYPRE build" ) set(EXTERN_HYPRE_GIT_TAG - "57bfb26e268ddf003668c5d0b5938ae258922a83" CACHE STRING # 01/08/2024 + "7e7fc8ce09153c60ae538a52a5f870f93b9608ca" CACHE STRING "Git tag for external HYPRE build" ) @@ -76,7 +76,7 @@ set(EXTERN_LIBCEED_GIT_BRANCH "Git branch for external libCEED build" ) set(EXTERN_LIBCEED_GIT_TAG - "699fe8f87dbb33a262c5d9f777cbfdff72550eee" CACHE STRING # main @ 01/11/2024 + "437ae4f6500350dd714f2f799cd1eb0b6575cf0f" CACHE STRING # main @ 02/14/2024 "Git tag for external libCEED build" ) @@ -90,7 +90,7 @@ set(EXTERN_LIBXSMM_GIT_BRANCH "Git branch for external LIBXSMM build" ) set(EXTERN_LIBXSMM_GIT_TAG - "462cbc42f7c5423642a340a71f825731a013ecd6" CACHE STRING # 01/11/2023 + "13df674c4b73a1b84f6456de8595903ebfbb43e0" CACHE STRING "Git tag for external LIBXSMM build" ) @@ -104,7 +104,7 @@ set(EXTERN_MAGMA_GIT_BRANCH "Git branch for external MAGMA 
build" ) set(EXTERN_MAGMA_GIT_TAG - "061a800467aab6d15f4353c4fc0770375863b43e" CACHE STRING # 12/29/2023 + "1628609ab4a8421bd50bafb33cabf49001c1c97b" CACHE STRING "Git tag for external MAGMA build" ) @@ -118,7 +118,7 @@ set(EXTERN_METIS_GIT_BRANCH "Git branch for external METIS build" ) set(EXTERN_METIS_GIT_TAG - "8b194fdf09661ac41b36fa16db0474d38f46f1ac" CACHE STRING # 01/10/2023 + "8b194fdf09661ac41b36fa16db0474d38f46f1ac" CACHE STRING "Git tag for external METIS build" ) @@ -132,7 +132,7 @@ set(EXTERN_MFEM_GIT_BRANCH "Git branch for external MFEM build" ) set(EXTERN_MFEM_GIT_TAG - "2b27d4815f6e549ffb01065e9ad2d3549706ccec" CACHE STRING # master @ 01/11/2024 + "db77249ac0f22e277081d43a5db6c3601e9e2830" CACHE STRING # master @ 02/13/2024 "Git tag for external MFEM build" ) @@ -146,7 +146,7 @@ set(EXTERN_MUMPS_GIT_BRANCH "Git branch for external MUMPS build" ) set(EXTERN_MUMPS_GIT_TAG - "89d12a54114636a040baf652a3b654530ef2590f" CACHE STRING # 01/11/2023 + "888feec5482665dc7f334e9d08cc45e98128340c" CACHE STRING "Git tag for external MUMPS build" ) @@ -160,7 +160,7 @@ set(EXTERN_PARMETIS_GIT_BRANCH "Git branch for external ParMETIS build" ) set(EXTERN_PARMETIS_GIT_TAG - "f5e3aab04fd5fe6e09fa02f885c1c29d349f9f8b" CACHE STRING # 01/11/2023 + "f5e3aab04fd5fe6e09fa02f885c1c29d349f9f8b" CACHE STRING "Git tag for external ParMETIS build" ) @@ -174,7 +174,7 @@ set(EXTERN_PETSC_GIT_BRANCH "Git branch for external PETSc build" ) set(EXTERN_PETSC_GIT_TAG - "e98d5aa1d9a8b761c58f921ec2d97e44eaf55cc3" CACHE STRING # 01/12/2024 + "d1841210cfd64925ca30ed94c24c0bbb5cdc9142" CACHE STRING "Git tag for external PETSc build" ) @@ -188,7 +188,7 @@ set(EXTERN_SCALAPACK_GIT_BRANCH "Git branch for external ScaLAPACK build" ) set(EXTERN_SCALAPACK_GIT_TAG - "8435bf3bc435d9611d096abab46ebf6beb1d35ea" CACHE STRING # 11/15/2023 + "8435bf3bc435d9611d096abab46ebf6beb1d35ea" CACHE STRING "Git tag for external ScaLAPACK build" ) @@ -202,7 +202,7 @@ set(EXTERN_SLEPC_GIT_BRANCH "Git branch for 
external SLEPc build" ) set(EXTERN_SLEPC_GIT_TAG - "9fccf47e48b0f46fc169e86349cb3ddd4f8dee5b" CACHE STRING # 01/12/2024 + "7980363cc28f3ee28098ac846022f6cc6d867aa6" CACHE STRING "Git tag for external SLEPc build" ) @@ -216,7 +216,7 @@ set(EXTERN_STRUMPACK_GIT_BRANCH "Git branch for external STRUMPACK build" ) set(EXTERN_STRUMPACK_GIT_TAG - "90dcaaa3c7f06e69f94da43aa1043dd9e030cc2a" CACHE STRING # 12/14/2023 + "3462e8cbc6e57d0c3934492abbcc69ee7edb9457" CACHE STRING "Git tag for external STRUMPACK build" ) @@ -230,7 +230,7 @@ set(EXTERN_SUPERLU_GIT_BRANCH "Git branch for external SuperLU_DIST build" ) set(EXTERN_SUPERLU_GIT_TAG - "b3eecd3eaac3a1332d0d2c5fc052d1af114df192" CACHE STRING # 11/17/2023 + "67575a491de06e81a98c804929c8f0de40ee5990" CACHE STRING "Git tag for external SuperLU_DIST build" ) @@ -244,7 +244,7 @@ set(EXTERN_ZFP_GIT_BRANCH "Git branch for external ZFP build" ) set(EXTERN_ZFP_GIT_TAG - "f61aac5ca3445ac48bab94d0e3da62bb77bdd989" CACHE STRING # 01/11/2024 + "6d2b93cf0fda729903fa4b95d4b8580f8f86950f" CACHE STRING "Git tag for external ZFP build" ) @@ -256,7 +256,7 @@ set(EXTERN_JSON_URL # fmt set(EXTERN_FMT_URL - "https://github.com/fmtlib/fmt/releases/download/10.1.1/fmt-10.1.1.zip" CACHE STRING + "https://github.com/fmtlib/fmt/releases/download/10.2.0/fmt-10.2.0.zip" CACHE STRING "URL for external fmt build" ) diff --git a/cmake/ExternalLibCEED.cmake b/cmake/ExternalLibCEED.cmake index 008fa9754..203038d8e 100644 --- a/cmake/ExternalLibCEED.cmake +++ b/cmake/ExternalLibCEED.cmake @@ -121,6 +121,11 @@ endif() string(REPLACE ";" "; " LIBCEED_OPTIONS_PRINT "${LIBCEED_OPTIONS}") message(STATUS "LIBCEED_OPTIONS: ${LIBCEED_OPTIONS_PRINT}") +# Patch for LIBXSMM pre-v2 +set(LIBCEED_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/libCEED/patch_xsmm.diff" +) + include(ExternalProject) ExternalProject_Add(libCEED DEPENDS ${LIBCEED_DEPENDENCIES} @@ -131,6 +136,7 @@ ExternalProject_Add(libCEED PREFIX ${CMAKE_BINARY_DIR}/extern/libCEED-cmake BUILD_IN_SOURCE TRUE 
UPDATE_COMMAND "" + PATCH_COMMAND git apply "${LIBCEED_PATCH_FILES}" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBCEED_OPTIONS} install diff --git a/cmake/ExternalMFEM.cmake b/cmake/ExternalMFEM.cmake index 1e3927366..0d5c3e380 100644 --- a/cmake/ExternalMFEM.cmake +++ b/cmake/ExternalMFEM.cmake @@ -98,6 +98,7 @@ endif() # Configure BLAS/LAPACK for dependencies if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") list(APPEND MFEM_OPTIONS + # "-DMFEM_USE_LAPACK=YES" "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" ) @@ -361,8 +362,9 @@ set(MFEM_PATCH_FILES "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_vis_dev.diff" "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_partitioner_dev.diff" "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_par_tet_mesh_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_pncmesh_update_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_getnodalvalues_device_fix.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_workspace_vectors.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff" ) include(ExternalProject) diff --git a/extern/patch/libCEED/patch_xsmm.diff b/extern/patch/libCEED/patch_xsmm.diff new file mode 100644 index 000000000..aeb05410e --- /dev/null +++ b/extern/patch/libCEED/patch_xsmm.diff @@ -0,0 +1,98 @@ +diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c +index 44e9081e..09bdb6fd 100644 +--- a/backends/ref/ceed-ref-restriction.c ++++ b/backends/ref/ceed-ref-restriction.c +@@ -233,7 +233,7 @@ static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestr + CeedScalar vv_loc; + + vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset]; +- CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; ++ vv[impl->offsets[j + e * elem_size] + k * 
comp_stride] += vv_loc; + } + } + } +@@ -257,7 +257,7 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes + CeedScalar vv_loc; + + vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0); +- CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; ++ vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; + } + } + } +@@ -287,7 +287,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + for (n = 1; n < elem_size - 1; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -299,7 +299,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -309,7 +309,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + } +@@ -338,7 +338,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + 
abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + for (n = 1; n < elem_size - 1; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -350,7 +350,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -360,7 +360,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + } +diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c +index a64f5e9f..555352ef 100644 +--- a/backends/xsmm/ceed-xsmm-tensor.c ++++ b/backends/xsmm/ceed-xsmm-tensor.c +@@ -30,7 +30,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) + : libxsmm_create_gemm_shape(J, A, B, !t_mode ? 
B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); +- const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); ++ const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); +@@ -50,7 +50,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) + : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); +- const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); ++ const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); diff --git a/extern/patch/mfem/patch_getnodalvalues_device_fix.diff b/extern/patch/mfem/patch_getnodalvalues_device_fix.diff deleted file mode 100644 index 1f41b8123..000000000 --- a/extern/patch/mfem/patch_getnodalvalues_device_fix.diff +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/fem/gridfunc.cpp b/fem/gridfunc.cpp -index 310d8d704..bd5504d63 100644 ---- a/fem/gridfunc.cpp -+++ b/fem/gridfunc.cpp -@@ -1924,9 +1924,9 @@ void GridFunction::GetNodalValues(Vector &nval, int vdim) const - Array values; - Array overlap(fes->GetNV()); - nval.SetSize(fes->GetNV()); -- - nval = 0.0; - overlap = 0; -+ nval.HostReadWrite(); - for (i = 0; i < fes->GetNE(); i++) - { - fes->GetElementVertices(i, vertices); diff --git 
a/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff b/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff new file mode 100644 index 000000000..9658c832c --- /dev/null +++ b/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff @@ -0,0 +1,970 @@ +diff --git a/fem/complex_fem.cpp b/fem/complex_fem.cpp +index 95565804d..a26707cbe 100644 +--- a/fem/complex_fem.cpp ++++ b/fem/complex_fem.cpp +@@ -1243,25 +1243,28 @@ ParSesquilinearForm::FormLinearSystem(const Array &ess_tdof_list, + HypreParMatrix * Ah; + A_i.Get(Ah); + hypre_ParCSRMatrix *Aih = *Ah; +-#if !defined(HYPRE_USING_GPU) +- ess_tdof_list.HostRead(); +- for (int k = 0; k < n; k++) ++ if (!HypreUsingGPU()) + { +- const int j = ess_tdof_list[k]; +- Aih->diag->data[Aih->diag->i[j]] = 0.0; ++ ess_tdof_list.HostRead(); ++ for (int k = 0; k < n; k++) ++ { ++ const int j = ess_tdof_list[k]; ++ Aih->diag->data[Aih->diag->i[j]] = 0.0; ++ } + } +-#else +- Ah->HypreReadWrite(); +- const int *d_ess_tdof_list = +- ess_tdof_list.GetMemory().Read(MemoryClass::DEVICE, n); +- const int *d_diag_i = Aih->diag->i; +- double *d_diag_data = Aih->diag->data; +- MFEM_GPU_FORALL(k, n, ++ else + { +- const int j = d_ess_tdof_list[k]; +- d_diag_data[d_diag_i[j]] = 0.0; +- }); +-#endif ++ Ah->HypreReadWrite(); ++ const int *d_ess_tdof_list = ++ ess_tdof_list.GetMemory().Read(MemoryClass::DEVICE, n); ++ HYPRE_Int *d_diag_i = Aih->diag->i; ++ double *d_diag_data = Aih->diag->data; ++ mfem::forall(n, [=] MFEM_HOST_DEVICE (int k) ++ { ++ const int j = d_ess_tdof_list[k]; ++ d_diag_data[d_diag_i[j]] = 0.0; ++ }); ++ } + } + else + { +diff --git a/fem/lor/lor_ams.cpp b/fem/lor/lor_ams.cpp +index f8d0dfd8c..9066cc20a 100644 +--- a/fem/lor/lor_ams.cpp ++++ b/fem/lor/lor_ams.cpp +@@ -291,7 +291,7 @@ void BatchedLOR_AMS::FormCoordinateVectors(const Vector &X_vert) + const auto ltdof_ldof = HypreRead(R->GetMemoryJ()); + + // Go from E-vector format directly to T-vector format +- MFEM_HYPRE_FORALL(i, ntdofs, ++ 
mfem::forall_switch(HypreUsingGPU(), ntdofs, [=] MFEM_HOST_DEVICE (int i) + { + const int j = d_offsets[ltdof_ldof[i]]; + for (int c = 0; c < sdim; ++c) +diff --git a/general/mem_manager.hpp b/general/mem_manager.hpp +index c5fed95a0..753bd74b3 100644 +--- a/general/mem_manager.hpp ++++ b/general/mem_manager.hpp +@@ -995,7 +995,7 @@ inline void Memory::MakeAlias(const Memory &base, int offset, int size) + if (!(base.flags & Registered)) + { + if ( +-#if !defined(HYPRE_USING_GPU) ++#if !defined(HYPRE_USING_GPU) || MFEM_HYPRE_VERSION >= 22600 + // If the following condition is true then MemoryManager::Exists() + // should also be true: + IsDeviceMemory(MemoryManager::GetDeviceMemoryType()) +diff --git a/linalg/hypre.cpp b/linalg/hypre.cpp +index 0fd16be03..19cd473de 100644 +--- a/linalg/hypre.cpp ++++ b/linalg/hypre.cpp +@@ -65,6 +65,18 @@ void Hypre::SetDefaultOptions() + #endif + #endif + ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) ++ { ++ HYPRE_SetMemoryLocation(HYPRE_MEMORY_DEVICE); ++ HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); ++ } ++ else ++ { ++ HYPRE_SetMemoryLocation(HYPRE_MEMORY_HOST); ++ HYPRE_SetExecutionPolicy(HYPRE_EXEC_HOST); ++ } ++#else + // The following options are hypre's defaults as of hypre-2.24 + + // Allocate hypre objects in GPU memory (default) +@@ -72,6 +84,16 @@ void Hypre::SetDefaultOptions() + + // Where to execute when using UVM (default) + // HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); ++#endif ++ ++ // TODO 23/11/28: introduced in https://github.com/hypre-space/hypre/pull/962, ++ // not yet in a release ++#if MFEM_HYPRE_VERSION >= 23100 ++ if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) ++ { ++ HYPRE_DeviceInitialize(); ++ } ++#endif + + // Use GPU-based random number generator (default) + // HYPRE_SetUseGpuRand(1); +@@ -121,22 +143,27 @@ bool CanShallowCopy(const Memory &src, MemoryClass mc) + inline void HypreParVector::_SetDataAndSize_() + { + hypre_Vector *x_loc = 
hypre_ParVectorLocalVector(x); +-#if !defined(HYPRE_USING_GPU) +- SetDataAndSize(hypre_VectorData(x_loc), +- internal::to_int(hypre_VectorSize(x_loc))); +-#else +- size = internal::to_int(hypre_VectorSize(x_loc)); +- MemoryType mt = (hypre_VectorMemoryLocation(x_loc) == HYPRE_MEMORY_HOST +- ? MemoryType::HOST : GetHypreMemoryType()); +- if (hypre_VectorData(x_loc) != NULL) ++#if defined(HYPRE_USING_GPU) ++ if (HypreUsingGPU()) + { +- data.Wrap(hypre_VectorData(x_loc), size, mt, false); ++ size = internal::to_int(hypre_VectorSize(x_loc)); ++ MemoryType mt = (hypre_VectorMemoryLocation(x_loc) == HYPRE_MEMORY_HOST ++ ? MemoryType::HOST : GetHypreMemoryType()); ++ if (hypre_VectorData(x_loc) != NULL) ++ { ++ data.Wrap(hypre_VectorData(x_loc), size, mt, false); ++ } ++ else ++ { ++ data.Reset(); ++ } + } + else ++#endif + { +- data.Reset(); ++ SetDataAndSize(hypre_VectorData(x_loc), ++ internal::to_int(hypre_VectorSize(x_loc))); + } +-#endif + } + + HypreParVector::HypreParVector(MPI_Comm comm, HYPRE_BigInt glob_size, +@@ -315,7 +342,7 @@ void HypreParVector::HypreRead() const + hypre_VectorData(x_loc) = + const_cast(data.Read(GetHypreMemoryClass(), size)); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -324,7 +351,7 @@ void HypreParVector::HypreReadWrite() + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = data.ReadWrite(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -333,7 +360,7 @@ void HypreParVector::HypreWrite() + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = data.Write(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ 
hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -347,7 +374,7 @@ void HypreParVector::WrapMemoryRead(const Memory &mem) + hypre_VectorData(x_loc) = + const_cast(mem.Read(GetHypreMemoryClass(), size)); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -361,7 +388,7 @@ void HypreParVector::WrapMemoryReadWrite(Memory &mem) + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = mem.ReadWrite(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -375,7 +402,7 @@ void HypreParVector::WrapMemoryWrite(Memory &mem) + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = mem.Write(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -535,6 +562,12 @@ void HypreParMatrix::Init() + + void HypreParMatrix::Read(MemoryClass mc) const + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + const int num_rows = NumRows(); +@@ -547,15 +580,26 @@ void HypreParMatrix::Read(MemoryClass mc) const + offd->j = const_cast(mem_offd.J.Read(mc, offd_nnz)); + offd->data = const_cast(mem_offd.data.Read(mc, offd_nnz)); + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = 
++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + diag->memory_location = ml; + offd->memory_location = ml; +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + void HypreParMatrix::ReadWrite(MemoryClass mc) + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + const int num_rows = NumRows(); +@@ -568,15 +612,26 @@ void HypreParMatrix::ReadWrite(MemoryClass mc) + offd->j = mem_offd.J.ReadWrite(mc, offd_nnz); + offd->data = mem_offd.data.ReadWrite(mc, offd_nnz); + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = ++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? 
HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + diag->memory_location = ml; + offd->memory_location = ml; +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + void HypreParMatrix::Write(MemoryClass mc, bool set_diag, bool set_offd) + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + if (set_diag) +@@ -592,11 +647,16 @@ void HypreParMatrix::Write(MemoryClass mc, bool set_diag, bool set_offd) + offd->data = mem_offd.data.Write(mc, mem_offd.data.Capacity()); + } + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = ++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? 
HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + if (set_diag) { diag->memory_location = ml; } + if (set_offd) { offd->memory_location = ml; } +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + HypreParMatrix::HypreParMatrix() +@@ -759,7 +819,7 @@ signed char HypreParMatrix::HypreCsrToMem(hypre_CSRMatrix *h_mat, + h_mat->data = mem.data.ReadWrite(hypre_mc, nnz); + h_mat->owns_data = 0; + #if MFEM_HYPRE_VERSION >= 21800 +- h_mat->memory_location = HYPRE_MEMORY_DEVICE; ++ h_mat->memory_location = mfem::GetHypreMemoryLocation(); + #endif + return 3; + } +@@ -1360,8 +1420,11 @@ hypre_ParCSRMatrix* HypreParMatrix::StealData() + MFEM_ASSERT(ParCSROwner, ""); + hypre_ParCSRMatrix *R = A; + #ifdef HYPRE_USING_GPU +- if (diagOwner == -1) { HostReadWrite(); } +- else { HypreReadWrite(); } ++ if (HypreUsingGPU()) ++ { ++ if (diagOwner == -1) { HostReadWrite(); } ++ else { HypreReadWrite(); } ++ } + #endif + ParCSROwner = false; + Destroy(); +@@ -1709,7 +1772,10 @@ void HypreParMatrix::EnsureMultTranspose() const + #if (MFEM_HYPRE_VERSION == 22500 && HYPRE_DEVELOP_NUMBER >= 1) || \ + (MFEM_HYPRE_VERSION > 22500) + #ifdef HYPRE_USING_GPU +- hypre_ParCSRMatrixLocalTranspose(A); ++ if (HypreUsingGPU()) ++ { ++ hypre_ParCSRMatrixLocalTranspose(A); ++ } + #endif + #endif + } +@@ -1719,15 +1785,18 @@ void HypreParMatrix::ResetTranspose() const + #if (MFEM_HYPRE_VERSION == 22500 && HYPRE_DEVELOP_NUMBER >= 1) || \ + (MFEM_HYPRE_VERSION > 22500) + #ifdef HYPRE_USING_GPU +- if (A->diagT) ++ if (HypreUsingGPU()) + { +- hypre_CSRMatrixDestroy(A->diagT); +- A->diagT = NULL; +- } +- if (A->offdT) +- { +- hypre_CSRMatrixDestroy(A->offdT); +- A->offdT = NULL; ++ if (A->diagT) ++ { ++ hypre_CSRMatrixDestroy(A->diagT); ++ A->diagT = NULL; ++ } ++ if (A->offdT) ++ { ++ hypre_CSRMatrixDestroy(A->offdT); ++ A->offdT = NULL; ++ } + } + #endif + #endif +@@ -2416,8 +2485,8 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + } + + // Which of the local 
rows are to be eliminated? +- MFEM_HYPRE_FORALL(i, diag_nrows, eliminate_row[i] = 0; ); +- MFEM_HYPRE_FORALL(i, n_ess_dofs, eliminate_row[ess_dofs_d[i]] = 1; ); ++ mfem::forall_switch(HypreUsingGPU(), diag_nrows, [=] MFEM_HOST_DEVICE (int i) { eliminate_row[i] = 0; }); ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) { eliminate_row[ess_dofs_d[i]] = 1; }); + + // Use a matvec communication pattern to find (in eliminate_col) which of + // the local offd columns are to be eliminated +@@ -2428,26 +2497,36 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + + HYPRE_Int *send_map_elmts; + #if defined(HYPRE_USING_GPU) +- hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg); +- send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg); +-#else +- send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); ++ if (HypreUsingGPU()) ++ { ++ hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg); ++ send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg); ++ } ++ else + #endif +- MFEM_HYPRE_FORALL(i, int_buf_sz, ++ { ++ send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); ++ } ++ mfem::forall_switch(HypreUsingGPU(), int_buf_sz, [=] MFEM_HOST_DEVICE (int i) + { + int k = send_map_elmts[i]; + int_buf_data[i] = eliminate_row[k]; + }); + + #if defined(HYPRE_USING_GPU) +- // Try to use device-aware MPI for the communication if available +- comm_handle = hypre_ParCSRCommHandleCreate_v2( +- 11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, +- HYPRE_MEMORY_DEVICE, eliminate_col); +-#else +- comm_handle = hypre_ParCSRCommHandleCreate( +- 11, comm_pkg, int_buf_data, eliminate_col ); ++ if (HypreUsingGPU()) ++ { ++ // Try to use device-aware MPI for the communication if available ++ comm_handle = hypre_ParCSRCommHandleCreate_v2( ++ 11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, ++ HYPRE_MEMORY_DEVICE, eliminate_col); ++ } ++ else + #endif ++ { ++ comm_handle = hypre_ParCSRCommHandleCreate( ++ 11, comm_pkg, int_buf_data, 
eliminate_col ); ++ } + } + + // Eliminate rows and columns in the diagonal block +@@ -2456,7 +2535,7 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + const auto J = diag->j; + auto data = diag->data; + +- MFEM_HYPRE_FORALL(i, n_ess_dofs, ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) + { + const int idof = ess_dofs_d[i]; + for (int j=I[idof]; j &ess_dofs, + { + const auto I = offd->i; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, n_ess_dofs, ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) + { + const int idof = ess_dofs_d[i]; + for (int j=I[idof]; j &ess_dofs, + const auto I = offd->i; + const auto J = offd->j; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, nrows_offd, ++ mfem::forall_switch(HypreUsingGPU(), nrows_offd, [=] MFEM_HOST_DEVICE (int i) + { + for (int j=I[i]; j{diag,offd}, so + // that they can be destroyed by hypre when hypre_ParCSRMatrixDestroy(A) +@@ -2855,10 +2934,15 @@ HypreParMatrix * ParMult(const HypreParMatrix *A, const HypreParMatrix *B, + { + hypre_ParCSRMatrix * ab; + #ifdef HYPRE_USING_GPU +- ab = hypre_ParCSRMatMat(*A, *B); +-#else +- ab = hypre_ParMatmul(*A,*B); ++ if (HypreUsingGPU()) ++ { ++ ab = hypre_ParCSRMatMat(*A, *B); ++ } ++ else + #endif ++ { ++ ab = hypre_ParMatmul(*A,*B); ++ } + hypre_ParCSRMatrixSetNumNonzeros(ab); + + if (!hypre_ParCSRMatrixCommPkg(ab)) { hypre_MatvecCommPkgCreate(ab); } +@@ -2882,6 +2966,7 @@ HypreParMatrix * RAP(const HypreParMatrix *A, const HypreParMatrix *P) + // in ex28p. + // Quick fix: add a diagonal matrix with 0 diagonal. + // Maybe use hypre_CSRMatrixCheckDiagFirst to see if we need the fix. 
++ if (HypreUsingGPU()) + { + hypre_ParCSRMatrix *Q = hypre_ParCSRMatMat(*A,*P); + const bool keepTranspose = false; +@@ -2891,25 +2976,27 @@ HypreParMatrix * RAP(const HypreParMatrix *A, const HypreParMatrix *P) + // alternative: + // hypre_ParCSRMatrixRAPKT + } +-#else ++ else ++#endif ++ { + #if MFEM_HYPRE_VERSION <= 22200 +- HYPRE_Int P_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); ++ HYPRE_Int P_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); + #endif + +- hypre_BoomerAMGBuildCoarseOperator(*P,*A,*P,&rap); ++ hypre_BoomerAMGBuildCoarseOperator(*P,*A,*P,&rap); + + #if MFEM_HYPRE_VERSION <= 22200 +- /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts +- from P (even if it does not own them)! */ +- hypre_ParCSRMatrixSetRowStartsOwner(rap,0); +- hypre_ParCSRMatrixSetColStartsOwner(rap,0); +- if (P_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*P, 1); +- } +-#endif ++ /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts ++ from P (even if it does not own them)! 
*/ ++ hypre_ParCSRMatrixSetRowStartsOwner(rap,0); ++ hypre_ParCSRMatrixSetColStartsOwner(rap,0); ++ if (P_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*P, 1); ++ } + #endif ++ } + + hypre_ParCSRMatrixSetNumNonzeros(rap); + // hypre_MatvecCommPkgCreate(rap); +@@ -2923,36 +3010,39 @@ HypreParMatrix * RAP(const HypreParMatrix * Rt, const HypreParMatrix *A, + hypre_ParCSRMatrix * rap; + + #ifdef HYPRE_USING_GPU ++ if (HypreUsingGPU()) + { + hypre_ParCSRMatrix *Q = hypre_ParCSRMatMat(*A,*P); + rap = hypre_ParCSRTMatMat(*Rt,Q); + hypre_ParCSRMatrixDestroy(Q); + } +-#else ++ else ++#endif ++ { + #if MFEM_HYPRE_VERSION <= 22200 +- HYPRE_Int P_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); +- HYPRE_Int Rt_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*Rt)); ++ HYPRE_Int P_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); ++ HYPRE_Int Rt_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*Rt)); + #endif + +- hypre_BoomerAMGBuildCoarseOperator(*Rt,*A,*P,&rap); ++ hypre_BoomerAMGBuildCoarseOperator(*Rt,*A,*P,&rap); + + #if MFEM_HYPRE_VERSION <= 22200 +- /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts +- from Rt and P (even if they do not own them)! */ +- hypre_ParCSRMatrixSetRowStartsOwner(rap,0); +- hypre_ParCSRMatrixSetColStartsOwner(rap,0); +- if (P_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*P, 1); +- } +- if (Rt_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*Rt, 1); +- } +-#endif ++ /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts ++ from Rt and P (even if they do not own them)! 
*/ ++ hypre_ParCSRMatrixSetRowStartsOwner(rap,0); ++ hypre_ParCSRMatrixSetColStartsOwner(rap,0); ++ if (P_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*P, 1); ++ } ++ if (Rt_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*Rt, 1); ++ } + #endif ++ } + + hypre_ParCSRMatrixSetNumNonzeros(rap); + // hypre_MatvecCommPkgCreate(rap); +@@ -3382,7 +3472,7 @@ int ParCSRRelax_FIR(hypre_ParCSRMatrix *A, // matrix to relax with + + HypreSmoother::HypreSmoother() : Solver() + { +- type = default_type; ++ type = DefaultType(); + relax_times = 1; + relax_weight = 1.0; + omega = 1.0; +@@ -3522,17 +3612,22 @@ void HypreSmoother::SetOperator(const Operator &op) + if (l1_norms && pos_l1_norms) + { + #if defined(HYPRE_USING_GPU) +- double *d_l1_norms = l1_norms; // avoid *this capture +- MFEM_GPU_FORALL(i, height, +- { +- d_l1_norms[i] = std::abs(d_l1_norms[i]); +- }); +-#else +- for (int i = 0; i < height; i++) ++ if (HypreUsingGPU()) + { +- l1_norms[i] = std::abs(l1_norms[i]); ++ double *d_l1_norms = l1_norms; // avoid *this capture ++ MFEM_GPU_FORALL(i, height, ++ { ++ d_l1_norms[i] = std::abs(d_l1_norms[i]); ++ }); + } ++ else + #endif ++ { ++ for (int i = 0; i < height; i++) ++ { ++ l1_norms[i] = std::abs(l1_norms[i]); ++ } ++ } + } + + if (type == 16) +@@ -4808,41 +4903,49 @@ HypreBoomerAMG::HypreBoomerAMG(const HypreParMatrix &A) : HypreSolver(&A) + + void HypreBoomerAMG::SetDefaultOptions() + { +-#if !defined(HYPRE_USING_GPU) +- // AMG coarsening options: +- int coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP +- int agg_levels = 1; // number of aggressive coarsening levels +- double theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 +- + // AMG interpolation options: +- int interp_type = 6; // 6 = extended+i, 0 = classical +- int Pmax = 4; // max number of elements per row in P ++ int coarsen_type, agg_levels, interp_type, Pmax, relax_type, relax_sweeps, ++ print_level, max_levels; ++ double theta; + +- // AMG relaxation 
options: +- int relax_type = 8; // 8 = l1-GS, 6 = symm. GS, 3 = GS, 18 = l1-Jacobi +- int relax_sweeps = 1; // relaxation sweeps on each level ++ if (!HypreUsingGPU()) ++ { ++ // AMG coarsening options: ++ coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP ++ agg_levels = 1; // number of aggressive coarsening levels ++ theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 + +- // Additional options: +- int print_level = 1; // print AMG iterations? 1 = no, 2 = yes +- int max_levels = 25; // max number of levels in AMG hierarchy +-#else +- // AMG coarsening options: +- int coarsen_type = 8; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP +- int agg_levels = 0; // number of aggressive coarsening levels +- double theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 ++ // AMG interpolation options: ++ interp_type = 6; // 6 = extended+i, 0 = classical ++ Pmax = 4; // max number of elements per row in P + +- // AMG interpolation options: +- int interp_type = 6; // 6 = extended+i, or 18 = extended+e +- int Pmax = 4; // max number of elements per row in P ++ // AMG relaxation options: ++ relax_type = 8; // 8 = l1-GS, 6 = symm. GS, 3 = GS, 18 = l1-Jacobi ++ relax_sweeps = 1; // relaxation sweeps on each level + +- // AMG relaxation options: +- int relax_type = 18; // 18 = l1-Jacobi, or 16 = Chebyshev +- int relax_sweeps = 1; // relaxation sweeps on each level ++ // Additional options: ++ print_level = 1; // print AMG iterations? 1 = no, 2 = yes ++ max_levels = 25; // max number of levels in AMG hierarchy ++ } ++ else ++ { ++ // AMG coarsening options: ++ coarsen_type = 8; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP ++ agg_levels = 0; // number of aggressive coarsening levels ++ theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 + +- // Additional options: +- int print_level = 1; // print AMG iterations? 
1 = no, 2 = yes +- int max_levels = 25; // max number of levels in AMG hierarchy +-#endif ++ // AMG interpolation options: ++ interp_type = 6; // 6 = extended+i, or 18 = extended+e ++ Pmax = 4; // max number of elements per row in P ++ ++ // AMG relaxation options: ++ relax_type = 18; // 18 = l1-Jacobi, or 16 = Chebyshev ++ relax_sweeps = 1; // relaxation sweeps on each level ++ ++ // Additional options: ++ print_level = 1; // print AMG iterations? 1 = no, 2 = yes ++ max_levels = 25; // max number of levels in AMG hierarchy ++ } + + HYPRE_BoomerAMGSetCoarsenType(amg_precond, coarsen_type); + HYPRE_BoomerAMGSetAggNumLevels(amg_precond, agg_levels); +@@ -4979,14 +5082,20 @@ void HypreBoomerAMG::SetSystemsOptions(int dim, bool order_bynodes) + // After the addition of hypre_IntArray, mapping is assumed + // to be a device pointer. Previously, it was assumed to be + // a host pointer. ++ HYPRE_Int *mapping = nullptr; + #if defined(hypre_IntArrayData) && defined(HYPRE_USING_GPU) +- HYPRE_Int *mapping = mfem_hypre_CTAlloc(HYPRE_Int, height); +- hypre_TMemcpy(mapping, h_mapping, HYPRE_Int, height, +- HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); +- mfem_hypre_TFree_host(h_mapping); +-#else +- HYPRE_Int *mapping = h_mapping; ++ if (HypreUsingGPU()) ++ { ++ mapping = mfem_hypre_CTAlloc(HYPRE_Int, height); ++ hypre_TMemcpy(mapping, h_mapping, HYPRE_Int, height, ++ HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); ++ mfem_hypre_TFree_host(h_mapping); ++ } ++ else + #endif ++ { ++ mapping = h_mapping; ++ } + + // hypre actually deletes the mapping pointer in HYPRE_BoomerAMGDestroy, + // so we don't need to track it +@@ -5079,7 +5188,10 @@ void HypreBoomerAMG::SetElasticityOptions(ParFiniteElementSpace *fespace_, + bool interp_refine_) + { + #ifdef HYPRE_USING_GPU +- MFEM_ABORT("this method is not supported in hypre built with GPU support"); ++ if (HypreUsingGPU()) ++ { ++ MFEM_ABORT("this method is not supported in hypre built with GPU support"); ++ } + #endif + + // Save the finite element 
space to support multiple calls to SetOperator() +@@ -5315,23 +5427,14 @@ void HypreAMS::MakeSolver(int sdim, int cycle_type) + int rlx_sweeps = 1; + double rlx_weight = 1.0; + double rlx_omega = 1.0; +-#if !defined(HYPRE_USING_GPU) +- int amg_coarsen_type = 10; +- int amg_agg_levels = 1; +- int amg_rlx_type = 8; +- int rlx_type = 2; +- double theta = 0.25; +- int amg_interp_type = 6; +- int amg_Pmax = 4; +-#else +- int amg_coarsen_type = 8; +- int amg_agg_levels = 0; +- int amg_rlx_type = 18; +- int rlx_type = 1; ++ const bool hypre_gpu = HypreUsingGPU(); ++ int amg_coarsen_type = hypre_gpu ? 8 : 10; ++ int amg_agg_levels = hypre_gpu ? 0 : 1; ++ int amg_rlx_type = hypre_gpu ? 18 : 8; ++ int rlx_type = hypre_gpu ? 1: 2; + double theta = 0.25; + int amg_interp_type = 6; + int amg_Pmax = 4; +-#endif + + space_dim = sdim; + ams_cycle_type = cycle_type; +@@ -5692,23 +5795,14 @@ void HypreADS::MakeSolver() + int rlx_sweeps = 1; + double rlx_weight = 1.0; + double rlx_omega = 1.0; +-#if !defined(HYPRE_USING_GPU) +- int rlx_type = 2; +- int amg_coarsen_type = 10; +- int amg_agg_levels = 1; +- int amg_rlx_type = 8; +- double theta = 0.25; +- int amg_interp_type = 6; +- int amg_Pmax = 4; +-#else +- int rlx_type = 1; +- int amg_coarsen_type = 8; +- int amg_agg_levels = 0; +- int amg_rlx_type = 18; ++ const bool hypre_gpu = HypreUsingGPU(); ++ int rlx_type = hypre_gpu ? 1 : 2; ++ int amg_coarsen_type = hypre_gpu ? 8 : 10; ++ int amg_agg_levels = hypre_gpu ? 0 : 1; ++ int amg_rlx_type = hypre_gpu ? 18 : 8; + double theta = 0.25; + int amg_interp_type = 6; + int amg_Pmax = 4; +-#endif + + HYPRE_ADSCreate(&ads); + +diff --git a/linalg/hypre.hpp b/linalg/hypre.hpp +index 8e7802713..4dfa72260 100644 +--- a/linalg/hypre.hpp ++++ b/linalg/hypre.hpp +@@ -43,19 +43,6 @@ + #error "MFEM_USE_HIP=YES is required when HYPRE is built with HIP!" 
+ #endif + +-// MFEM_HYPRE_FORALL is a macro similar to mfem::forall, but it executes on the +-// device that hypre was configured with (no matter what device was selected +-// in MFEM's runtime configuration). +-#if defined(HYPRE_USING_CUDA) +-#define MFEM_HYPRE_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \ +- (int i) {__VA_ARGS__}) +-#elif defined(HYPRE_USING_HIP) +-#define MFEM_HYPRE_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \ +- (int i) {__VA_ARGS__}) +-#else +-#define MFEM_HYPRE_FORALL(i, N,...) for (int i = 0; i < N; i++) { __VA_ARGS__ } +-#endif +- + #include "sparsemat.hpp" + #include "hypre_parcsr.hpp" + +@@ -134,14 +121,19 @@ inline int to_int(HYPRE_Int i) + + + /// The MemoryClass used by Hypre objects. +-inline constexpr MemoryClass GetHypreMemoryClass() ++inline MemoryClass GetHypreMemoryClass() + { + #if !defined(HYPRE_USING_GPU) + return MemoryClass::HOST; + #elif defined(HYPRE_USING_UNIFIED_MEMORY) + return MemoryClass::MANAGED; + #else ++#if MFEM_HYPRE_VERSION >= 22600 ++ return (GetHypreMemoryLocation() == HYPRE_MEMORY_DEVICE) ? MemoryClass::DEVICE : ++ MemoryClass::HOST; ++#else // MFEM_HYPRE_VERSION >= 22600 + return MemoryClass::DEVICE; ++#endif // MFEM_HYPRE_VERSION >= 22600 + #endif + } + +@@ -153,7 +145,27 @@ inline MemoryType GetHypreMemoryType() + #elif defined(HYPRE_USING_UNIFIED_MEMORY) + return MemoryType::MANAGED; + #else ++#if MFEM_HYPRE_VERSION >= 22600 ++ return (GetHypreMemoryLocation() == HYPRE_MEMORY_DEVICE) ? 
MemoryType::DEVICE : ++ Device::GetHostMemoryType(); ++#else // MFEM_HYPRE_VERSION >= 22600 + return MemoryType::DEVICE; ++#endif // MFEM_HYPRE_VERSION >= 22600 ++#endif ++} ++ ++inline bool HypreUsingGPU() ++{ ++#ifdef HYPRE_USING_GPU ++#if MFEM_HYPRE_VERSION >= 22600 ++ HYPRE_MemoryLocation loc; ++ HYPRE_GetMemoryLocation(&loc); ++ return loc == HYPRE_MEMORY_DEVICE; ++#else // MFEM_HYPRE_VERSION >= 22600 ++ return true; ++#endif // MFEM_HYPRE_VERSION >= 22600 ++#else ++ return false; + #endif + } + +@@ -1039,15 +1051,10 @@ public: + enum Type { Jacobi = 0, l1Jacobi = 1, l1GS = 2, l1GStr = 4, lumpedJacobi = 5, + GS = 6, OPFS = 10, Chebyshev = 16, Taubin = 1001, FIR = 1002 + }; +-#if !defined(HYPRE_USING_GPU) +- static constexpr Type default_type = l1GS; +-#else +- static constexpr Type default_type = l1Jacobi; +-#endif + + HypreSmoother(); + +- HypreSmoother(const HypreParMatrix &A_, int type = default_type, ++ HypreSmoother(const HypreParMatrix &A_, int type = DefaultType(), + int relax_times = 1, double relax_weight = 1.0, + double omega = 1.0, int poly_order = 2, + double poly_fraction = .3, int eig_est_cg_iter = 10); +@@ -1094,6 +1101,9 @@ public: + /// Apply transpose of the smoother to relax the linear system Ax=b + virtual void MultTranspose(const Vector &b, Vector &x) const; + ++ /// Return default smoother type for configuration ++ static Type DefaultType() { return HypreUsingGPU() ? 
l1Jacobi : l1GS; } ++ + virtual ~HypreSmoother(); + }; + +diff --git a/linalg/hypre_parcsr.hpp b/linalg/hypre_parcsr.hpp +index 3f428ecaa..c7cb98e76 100644 +--- a/linalg/hypre_parcsr.hpp ++++ b/linalg/hypre_parcsr.hpp +@@ -29,6 +29,33 @@ typedef HYPRE_Int HYPRE_BigInt; + #define HYPRE_MPI_BIG_INT HYPRE_MPI_INT + #endif + ++namespace mfem ++{ ++#if MFEM_HYPRE_VERSION >= 22600 ++inline HYPRE_MemoryLocation GetHypreMemoryLocation() ++{ ++ HYPRE_MemoryLocation loc; ++ HYPRE_GetMemoryLocation(&loc); ++ return loc; ++} ++inline HYPRE_ExecutionPolicy GetHypreExecutionPolicy() ++{ ++ HYPRE_ExecutionPolicy pol; ++ HYPRE_GetExecutionPolicy(&pol); ++ return pol; ++} ++#else ++inline HYPRE_MemoryLocation GetHypreMemoryLocation() ++{ ++#ifdef HYPRE_USING_GPU ++ return HYPRE_MEMORY_DEVICE; ++#else ++ return HYPRE_MEMORY_HOST; ++#endif // HYPRE_USING_GPU ++} ++#endif ++}; ++ + // Define macro wrappers for hypre_TAlloc, hypre_CTAlloc and hypre_TFree: + // mfem_hypre_TAlloc, mfem_hypre_CTAlloc, and mfem_hypre_TFree, respectively. 
+ // Note: these macros are used in hypre.cpp, hypre_parcsr.cpp, and perhaps +@@ -46,10 +73,10 @@ typedef HYPRE_Int HYPRE_BigInt; + #else // MFEM_HYPRE_VERSION >= 21400 + + #define mfem_hypre_TAlloc(type, size) \ +- hypre_TAlloc(type, size, HYPRE_MEMORY_DEVICE) ++ hypre_TAlloc(type, size, mfem::GetHypreMemoryLocation()) + #define mfem_hypre_CTAlloc(type, size) \ +- hypre_CTAlloc(type, size, HYPRE_MEMORY_DEVICE) +-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, HYPRE_MEMORY_DEVICE) ++ hypre_CTAlloc(type, size, mfem::GetHypreMemoryLocation()) ++#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, mfem::GetHypreMemoryLocation()) + + #define mfem_hypre_TAlloc_host(type, size) \ + hypre_TAlloc(type, size, HYPRE_MEMORY_HOST) +diff --git a/miniapps/hdiv-linear-solver/discrete_divergence.cpp b/miniapps/hdiv-linear-solver/discrete_divergence.cpp +index 7b4eb9e4f..fde64a4cd 100644 +--- a/miniapps/hdiv-linear-solver/discrete_divergence.cpp ++++ b/miniapps/hdiv-linear-solver/discrete_divergence.cpp +@@ -85,7 +85,7 @@ void EliminateColumns(HypreParMatrix &D, const Array &ess_dofs) + const auto I = diag->i; + const auto J = diag->j; + auto data = diag->data; +- MFEM_HYPRE_FORALL(i, nrows_diag, ++ mfem::forall_switch(HypreUsingGPU(), nrows_diag, [=] MFEM_HOST_DEVICE (int i) + { + for (int jj=I[i]; jj &ess_dofs) + const auto I = offd->i; + const auto J = offd->j; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, nrows_offd, ++ mfem::forall_switch(HypreUsingGPU(), nrows_offd, [=] MFEM_HOST_DEVICE (int i) + { + for (int jj=I[i]; jj::Copy(Array ©) const +@@ -868,11 +868,11 @@ inline void Array::Copy(Array ©) const } template @@ -379,7 +379,7 @@ index 0f16b3023..a19b099a1 100644 template diff --git a/general/communication.cpp b/general/communication.cpp -index 0c2fffc1f..e8002a273 100644 +index 840d1e39c..48209622e 100644 --- a/general/communication.cpp +++ b/general/communication.cpp @@ -275,7 +275,7 @@ void GroupTopology::Save(ostream &os) const @@ -392,7 +392,7 @@ index 
0c2fffc1f..e8002a273 100644 { int group_size = GetGroupSize(group_id); diff --git a/general/table.hpp b/general/table.hpp -index 2ed9f4a1b..96373b2d1 100644 +index 6f90e95bf..85efaeaec 100644 --- a/general/table.hpp +++ b/general/table.hpp @@ -208,6 +208,7 @@ void Transpose (const Table &A, Table &At, int ncols_A_ = -1); @@ -404,7 +404,7 @@ index 2ed9f4a1b..96373b2d1 100644 /// C = A * B (as boolean matrices) diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 7ce605dc8..7482859d3 100644 +index 572ad262d..ebb273f62 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp @@ -20,6 +20,7 @@ @@ -423,7 +423,7 @@ index 7ce605dc8..7482859d3 100644 #include // Include the METIS header, if using version 5. If using METIS 4, the needed -@@ -1290,7 +1292,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const +@@ -1332,7 +1334,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const return res; } @@ -432,7 +432,7 @@ index 7ce605dc8..7482859d3 100644 { os << "face topology="; switch (info.topology) -@@ -3124,7 +3126,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3166,7 +3168,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) { GetElementToFaceTable(); GenerateFaces(); @@ -441,7 +441,7 @@ index 7ce605dc8..7482859d3 100644 { GenerateBoundaryElements(); GetElementToFaceTable(); // update be_to_face -@@ -3144,7 +3146,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3186,7 +3188,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) if (Dim == 2) { GenerateFaces(); // 'Faces' in 2D refers to the edges @@ -450,7 +450,7 @@ index 7ce605dc8..7482859d3 100644 { GenerateBoundaryElements(); } -@@ -3158,7 +3160,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3200,7 +3202,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) if (Dim == 1) { GenerateFaces(); @@ -459,7 +459,7 @@ index 7ce605dc8..7482859d3 100644 { // be_to_face will be set inside GenerateBoundaryElements GenerateBoundaryElements(); -@@ -6083,6 +6085,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() 
const +@@ -6125,6 +6127,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const void Mesh::SetCurvature(int order, bool discont, int space_dim, int ordering) { @@ -472,7 +472,7 @@ index 7ce605dc8..7482859d3 100644 space_dim = (space_dim == -1) ? spaceDim : space_dim; FiniteElementCollection* nfec; if (discont) -@@ -13079,6 +13087,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, +@@ -13121,6 +13129,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, } @@ -1352,7 +1352,7 @@ index 7ce605dc8..7482859d3 100644 int flags, MemoryType d_mt) { diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index 9820b4e95..fd8111f5e 100644 +index 12a7b3ed9..3870604aa 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp @@ -29,6 +29,7 @@ @@ -1401,7 +1401,7 @@ index 9820b4e95..fd8111f5e 100644 const std::string &comments = "") const; /// Used in GetFaceElementTransformations (...) -@@ -546,7 +549,7 @@ protected: +@@ -541,7 +544,7 @@ protected: mfem v1.2 format with the given section_delimiter at the end. If @a comments is non-empty, it will be printed after the first line of the file, and each line should begin with '#'. */ @@ -1410,7 +1410,7 @@ index 9820b4e95..fd8111f5e 100644 std::string section_delimiter = "", const std::string &comments = "") const; -@@ -957,7 +960,7 @@ public: +@@ -950,7 +953,7 @@ public: int AddBdrPoint(int v, int attr = 1); @@ -1419,7 +1419,7 @@ index 9820b4e95..fd8111f5e 100644 /// Finalize the construction of a triangular Mesh. 
void FinalizeTriMesh(int generate_edges = 0, int refine = 0, bool fix_orientation = true); -@@ -2203,7 +2206,7 @@ public: +@@ -2256,7 +2259,7 @@ public: std::ostream &os, int elem_attr = 0) const; void PrintElementsWithPartitioning (int *partitioning, @@ -1428,7 +1428,7 @@ index 9820b4e95..fd8111f5e 100644 int interior_faces = 0); /// Print set of disjoint surfaces: -@@ -2211,13 +2214,13 @@ public: +@@ -2264,13 +2267,13 @@ public: * If Aface_face(i,j) != 0, print face j as a boundary * element with attribute i+1. */ @@ -1444,7 +1444,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Compute and print mesh characteristics such as number of vertices, number of elements, number of boundary elements, minimal and maximal -@@ -2237,7 +2240,7 @@ public: +@@ -2290,7 +2293,7 @@ public: #ifdef MFEM_DEBUG /// Output an NCMesh-compatible debug dump. @@ -1453,7 +1453,7 @@ index 9820b4e95..fd8111f5e 100644 #endif /// @} -@@ -2316,9 +2319,194 @@ public: +@@ -2369,9 +2372,194 @@ public: /// @} }; @@ -1651,7 +1651,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Structure for storing mesh geometric factors: coordinates, Jacobians, -@@ -2327,7 +2515,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); +@@ -2380,7 +2568,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); Mesh. See Mesh::GetGeometricFactors(). */ class GeometricFactors { @@ -1659,7 +1659,7 @@ index 9820b4e95..fd8111f5e 100644 private: void Compute(const GridFunction &nodes, MemoryType d_mt = MemoryType::DEFAULT); -@@ -2375,6 +2562,7 @@ public: +@@ -2428,6 +2615,7 @@ public: Vector detJ; }; @@ -1667,7 +1667,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Structure for storing face geometric factors: coordinates, Jacobians, determinants of the Jacobians, and normal vectors. 
*/ /** Typically objects of this type are constructed and owned by objects of class -@@ -2429,6 +2617,7 @@ public: +@@ -2482,6 +2670,7 @@ public: Vector normal; }; @@ -1675,7 +1675,7 @@ index 9820b4e95..fd8111f5e 100644 /// Class used to extrude the nodes of a mesh class NodeExtrudeCoefficient : public VectorCoefficient { -@@ -2460,8 +2649,12 @@ inline void ShiftRight(int &a, int &b, int &c) +@@ -2513,8 +2702,12 @@ inline void ShiftRight(int &a, int &b, int &c) a = c; c = b; b = t; } @@ -1690,7 +1690,7 @@ index 9820b4e95..fd8111f5e 100644 } diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 97b29108b..5017cd012 100644 +index 1c212c077..3f23d1877 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp @@ -249,6 +249,8 @@ ParMesh::ParMesh(MPI_Comm comm, Mesh &mesh, int *partitioning_, @@ -1710,7 +1710,7 @@ index 97b29108b..5017cd012 100644 Mesh::Finalize(refine, fix_orientation); -@@ -6245,6 +6248,7 @@ void ParMesh::ParPrint(ostream &os, const std::string &comments) const +@@ -6278,6 +6281,7 @@ void ParMesh::ParPrint(ostream &os, const std::string &comments) const { os << "total_shared_faces " << sface_lface.Size() << '\n'; } @@ -1719,10 +1719,10 @@ index 97b29108b..5017cd012 100644 { { diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index af5aa9a7c..d723204f6 100644 +index b7744f19c..479c6b502 100644 --- a/mesh/pmesh.hpp +++ b/mesh/pmesh.hpp -@@ -108,6 +108,8 @@ protected: +@@ -106,6 +106,8 @@ protected: // Convert the local 'meshgen' to a global one. 
void ReduceMeshGen(); @@ -1732,7 +1732,7 @@ index af5aa9a7c..d723204f6 100644 void FinalizeParTopo(); diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index d094b70cd..22eecbd42 100644 +index 283b35f8a..9653316bf 100644 --- a/mesh/tetrahedron.cpp +++ b/mesh/tetrahedron.cpp @@ -53,7 +53,7 @@ void Tetrahedron::Init(int ind1, int ind2, int ind3, int ind4, int attr, @@ -1758,7 +1758,7 @@ index d094b70cd..22eecbd42 100644 switch (face) { diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp -index 80f1de4e0..e6093bc08 100644 +index 2062c42ba..83665454e 100644 --- a/mesh/tetrahedron.hpp +++ b/mesh/tetrahedron.hpp @@ -58,12 +58,13 @@ public: @@ -1779,7 +1779,7 @@ index 80f1de4e0..e6093bc08 100644 void SetRefinementFlag(int rf) { refinement_flag = rf; } diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile -index 1ccec0455..e34a5637e 100644 +index 7510d8892..2029d1967 100644 --- a/miniapps/meshing/makefile +++ b/miniapps/meshing/makefile @@ -123,7 +123,7 @@ clean-build: @@ -1792,7 +1792,7 @@ index 1ccec0455..e34a5637e 100644 @rm -f partitioning.txt shaper.mesh extruder.mesh @rm -f optimized* perturbed* polar-nc.mesh diff --git a/miniapps/meshing/mesh-explorer.cpp b/miniapps/meshing/mesh-explorer.cpp -index f05e18e83..67e8b1f65 100644 +index 925371f7f..a6257a034 100644 --- a/miniapps/meshing/mesh-explorer.cpp +++ b/miniapps/meshing/mesh-explorer.cpp @@ -308,6 +308,7 @@ int main (int argc, char *argv[]) diff --git a/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff b/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff new file mode 100644 index 000000000..fdaf7cadf --- /dev/null +++ b/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff @@ -0,0 +1,13 @@ +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index aa416e1b1..42faa8aa5 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -11581,7 +11581,7 @@ void Mesh::PrintVTU(std::ostream &os, int ref, VTKFormat format, + const int *p = VTKGeometry::VertexPermutation[geom]; + for (int k = 0; k < nv; k++, j++) + { +- 
WriteBinaryOrASCII(os, buf, np + RG[p ? p[j] : j], " ", ++ WriteBinaryOrASCII(os, buf, np + RG[p ? (j - k + p[k]) : j], " ", + format); + } + if (format == VTKFormat::ASCII) { os << '\n'; } diff --git a/extern/patch/mfem/patch_mesh_vis_dev.diff b/extern/patch/mfem/patch_mesh_vis_dev.diff index 0541362af..02ad69963 100644 --- a/extern/patch/mfem/patch_mesh_vis_dev.diff +++ b/extern/patch/mfem/patch_mesh_vis_dev.diff @@ -1,5 +1,5 @@ diff --git a/fem/datacollection.cpp b/fem/datacollection.cpp -index 0dc718b07..3d478edaa 100644 +index d73d2bb5b..f1a21e3d6 100644 --- a/fem/datacollection.cpp +++ b/fem/datacollection.cpp @@ -210,6 +210,11 @@ void DataCollection::Save() @@ -282,7 +282,7 @@ index 0dc718b07..3d478edaa 100644 { restart_mode = restart_mode_; diff --git a/fem/datacollection.hpp b/fem/datacollection.hpp -index c216188af..4a26cd1f8 100644 +index 351c1239d..3b33198cf 100644 --- a/fem/datacollection.hpp +++ b/fem/datacollection.hpp @@ -133,6 +133,10 @@ private: @@ -470,10 +470,10 @@ index c216188af..4a26cd1f8 100644 /// preserve timestep metadata for any solutions prior to the currently /// defined time. diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 102ee4de6..625b049c0 100644 +index 41ac08480..572ad262d 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp -@@ -11423,7 +11423,7 @@ void Mesh::PrintVTU(std::string fname, +@@ -11476,7 +11476,7 @@ void Mesh::PrintVTU(std::string fname, VTKFormat format, bool high_order_output, int compression_level, @@ -482,7 +482,7 @@ index 102ee4de6..625b049c0 100644 { int ref = (high_order_output && Nodes) ? 
Nodes->FESpace()->GetMaxElementOrder() : 1; -@@ -11437,7 +11437,7 @@ void Mesh::PrintVTU(std::string fname, +@@ -11490,7 +11490,7 @@ void Mesh::PrintVTU(std::string fname, } os << " byte_order=\"" << VTKByteOrder() << "\">\n"; os << "\n"; @@ -492,10 +492,10 @@ index 102ee4de6..625b049c0 100644 os << "\n"; os << "" << std::endl; diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index b82ce4a76..317e4349a 100644 +index 02a02dfc6..12a7b3ed9 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp -@@ -2186,7 +2186,7 @@ public: +@@ -2240,7 +2240,7 @@ public: VTKFormat format=VTKFormat::ASCII, bool high_order_output=false, int compression_level=0, @@ -505,10 +505,10 @@ index b82ce4a76..317e4349a 100644 boundary attributes as a data array (useful for boundary conditions). */ void PrintBdrVTU(std::string fname, diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 81012851d..d4f6d1abd 100644 +index ac353e3d7..1c212c077 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp -@@ -6303,7 +6303,7 @@ void ParMesh::PrintVTU(std::string pathname, +@@ -6332,7 +6332,7 @@ void ParMesh::PrintVTU(std::string pathname, VTKFormat format, bool high_order_output, int compression_level, @@ -517,7 +517,7 @@ index 81012851d..d4f6d1abd 100644 { int pad_digits_rank = 6; DataCollection::create_directory(pathname, this, MyRank); -@@ -6363,7 +6363,8 @@ void ParMesh::PrintVTU(std::string pathname, +@@ -6392,7 +6392,8 @@ void ParMesh::PrintVTU(std::string pathname, std::string vtu_fname = pathname + "/" + fname + ".proc" + to_padded_string(MyRank, pad_digits_rank); @@ -528,10 +528,10 @@ index 81012851d..d4f6d1abd 100644 int ParMesh::FindPoints(DenseMatrix& point_mat, Array& elem_id, diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index 472a0491e..af5aa9a7c 100644 +index 1aa027a3c..b7744f19c 100644 --- a/mesh/pmesh.hpp +++ b/mesh/pmesh.hpp -@@ -690,7 +690,7 @@ public: +@@ -721,7 +721,7 @@ public: VTKFormat format=VTKFormat::ASCII, bool high_order_output=false, int compression_level=0, diff --git 
a/extern/patch/mfem/patch_mfem_device_fixes.diff b/extern/patch/mfem/patch_mfem_device_fixes.diff index 448adba25..03da57976 100644 --- a/extern/patch/mfem/patch_mfem_device_fixes.diff +++ b/extern/patch/mfem/patch_mfem_device_fixes.diff @@ -1,8 +1,8 @@ diff --git a/general/device.cpp b/general/device.cpp -index ccee71cd7..f664f70c3 100644 +index b239eea59..8b3e586ba 100644 --- a/general/device.cpp +++ b/general/device.cpp -@@ -9,14 +9,13 @@ +@@ -9,14 +9,12 @@ // terms of the BSD-3 license. We welcome feedback and contributions, see file // CONTRIBUTING.md for details. @@ -12,13 +12,13 @@ index ccee71cd7..f664f70c3 100644 #ifdef MFEM_USE_CEED #include "../fem/ceed/interface/util.hpp" #endif - +- -#include -#include #include namespace mfem -@@ -144,13 +143,11 @@ Device::Device() +@@ -144,13 +142,11 @@ Device::Device() } } @@ -32,7 +32,7 @@ index ccee71cd7..f664f70c3 100644 #ifdef MFEM_USE_CEED // Destroy FES -> CeedBasis, CeedElemRestriction hash table contents for (auto entry : internal::ceed_basis_map) -@@ -169,7 +166,6 @@ Device::~Device() +@@ -169,7 +165,6 @@ Device::~Device() mm.Destroy(); } Get().ngpu = -1; @@ -40,18 +40,19 @@ index ccee71cd7..f664f70c3 100644 Get().backends = Backend::CPU; Get().host_mem_type = MemoryType::HOST; Get().host_mem_class = MemoryClass::HOST; -@@ -189,6 +185,7 @@ void Device::Configure(const std::string &device, const int device_id) +@@ -193,28 +188,22 @@ void Device::Configure(const std::string &device, const int device_id) + { + bmap[internal::backend_name[i]] = internal::backend_list[i]; } - - std::map bmap; +- std::string::size_type beg = 0, end, option; + std::string device_option; - for (int i = 0; i < Backend::NUM_BACKENDS; i++) ++ std::string::size_type beg = 0, end; + while (1) { - bmap[internal::backend_name[i]] = internal::backend_list[i]; -@@ -200,21 +197,14 @@ void Device::Configure(const std::string &device, const int device_id) + end = device.find(',', beg); end = (end != std::string::npos) ? 
end : device.size(); const std::string bname = device.substr(beg, end - beg); - option = bname.find(':'); +- option = bname.find(':'); - if (option==std::string::npos) // No option - { - const std::string backend = bname; @@ -60,9 +61,10 @@ index ccee71cd7..f664f70c3 100644 - Get().MarkBackend(it->second); - } - else ++ const auto option = bname.find(':'); + const std::string backend = (option != std::string::npos) ? + bname.substr(0, option) : bname; -+ std::map::iterator it = bmap.find(backend); ++ const auto it = bmap.find(backend); + MFEM_VERIFY(it != bmap.end(), "Invalid backend name: '" << backend << '\''); + Get().MarkBackend(it->second); + if (option != std::string::npos) @@ -77,7 +79,7 @@ index ccee71cd7..f664f70c3 100644 } if (end == device.size()) { break; } beg = end + 1; -@@ -240,10 +230,10 @@ void Device::Configure(const std::string &device, const int device_id) +@@ -240,10 +229,10 @@ void Device::Configure(const std::string &device, const int device_id) #endif // Perform setup. @@ -86,137 +88,39 @@ index ccee71cd7..f664f70c3 100644 - // Enable the device - Enable(); -+ // Enable the device. ++ // Configure the host/device MemoryType/MemoryClass. + Get().UpdateMemoryTypeAndClass(device_option); // Copy all data members from the global 'singleton_device' into '*this'. if (this != &Get()) { std::memcpy(this, &Get(), sizeof(Device)); } -@@ -252,30 +242,6 @@ void Device::Configure(const std::string &device, const int device_id) - destroy_mm = true; - } - --// static method --void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) --{ -- // If the device and/or the MemoryTypes are configured through the -- // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -- // method. 
-- if (mem_host_env || mem_device_env || device_env) { return; } -- -- MFEM_VERIFY(!IsConfigured(), "the default MemoryTypes can only be set before" -- " Device construction and configuration"); -- MFEM_VERIFY(IsHostMemory(h_mt), -- "invalid host MemoryType, h_mt = " << (int)h_mt); -- MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -- "invalid device MemoryType, d_mt = " << (int)d_mt -- << " (h_mt = " << (int)h_mt << ')'); -- -- Get().host_mem_type = h_mt; -- Get().device_mem_type = d_mt; -- mem_types_set = true; -- -- // h_mt and d_mt will be set as dual to each other during configuration by -- // the call mm.Configure(...) in UpdateMemoryTypeAndClass() --} -- - void Device::Print(std::ostream &os) - { - os << "Device configuration: "; -@@ -307,96 +273,53 @@ void Device::Print(std::ostream &os) +@@ -307,10 +296,9 @@ void Device::Print(std::ostream &os) os << std::endl; } -void Device::UpdateMemoryTypeAndClass() -+// static method -+void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) ++void Device::UpdateMemoryTypeAndClass(const std::string &device_option) { -- const bool debug = Device::Allows(Backend::DEBUG_DEVICE); + const bool debug = Device::Allows(Backend::DEBUG_DEVICE); - -- const bool device = Device::Allows(Backend::DEVICE_MASK); -- --#ifdef MFEM_USE_UMPIRE -- // If MFEM has been compiled with Umpire support, use it as the default -- if (!mem_host_env && !mem_types_set) -- { -- host_mem_type = MemoryType::HOST_UMPIRE; -- if (!mem_device_env) -- { -- device_mem_type = MemoryType::HOST_UMPIRE; -- } -- } --#endif -- -- // Enable the device memory type -- if (device) -- { -- if (!mem_device_env) -- { -- if (mem_host_env) -- { -- switch (host_mem_type) -- { -- case MemoryType::HOST_UMPIRE: -- device_mem_type = MemoryType::DEVICE_UMPIRE; -- break; -- case MemoryType::HOST_DEBUG: -- device_mem_type = MemoryType::DEVICE_DEBUG; -- break; -- default: -- device_mem_type = MemoryType::DEVICE; -- } -- } -- else if (!mem_types_set) -- { --#ifndef 
MFEM_USE_UMPIRE -- device_mem_type = MemoryType::DEVICE; --#else -- device_mem_type = MemoryType::DEVICE_UMPIRE; --#endif -- } -- } -- device_mem_class = MemoryClass::DEVICE; -- } -- -- // Enable the UVM shortcut when requested -- if (device && device_option && !strcmp(device_option, "uvm")) -- { -- host_mem_type = MemoryType::MANAGED; -- device_mem_type = MemoryType::MANAGED; -- } -+ // If the device and/or the MemoryTypes are configured through the -+ // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -+ // method. -+ if (mem_host_env || mem_device_env || device_env) { return; } + const bool device = Device::Allows(Backend::DEVICE_MASK); -- // Enable the DEBUG mode when requested -- if (debug) -- { -- host_mem_type = MemoryType::HOST_DEBUG; -- device_mem_type = MemoryType::DEVICE_DEBUG; -- } -+ MFEM_VERIFY(!IsConfigured(), "The default MemoryTypes can only be set before" -+ " Device construction and configuration"); -+ MFEM_VERIFY(IsHostMemory(h_mt), -+ "Invalid host MemoryType, h_mt = " << (int)h_mt); -+ MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -+ "Invalid device MemoryType, d_mt = " << (int)d_mt -+ << " (h_mt = " << (int)h_mt << ')'); - -- MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -- "invalid device memory configuration!"); -+ Get().host_mem_type = h_mt; -+ Get().device_mem_type = d_mt; -+ mem_types_set = true; - -- // Update the memory manager with the new settings -- mm.Configure(host_mem_type, device_mem_type); -+ // h_mt and d_mt will be set as dual to each other during configuration by -+ // the call mm.Configure(...) in UpdateMemoryTypeAndClass(). 
+ #ifdef MFEM_USE_UMPIRE +@@ -357,7 +345,7 @@ void Device::UpdateMemoryTypeAndClass() + } + + // Enable the UVM shortcut when requested +- if (device && device_option && !strcmp(device_option, "uvm")) ++ if (device && device_option.find(":uvm") != std::string::npos) + { + host_mem_type = MemoryType::MANAGED; + device_mem_type = MemoryType::MANAGED; +@@ -377,26 +365,29 @@ void Device::UpdateMemoryTypeAndClass() + mm.Configure(host_mem_type, device_mem_type); } -void Device::Enable() +// static method -+int Device::GetNumGPU() ++int Device::GetDeviceCount() { - const bool accelerated = Get().backends & ~(Backend::CPU); - if (accelerated) { Get().mode = Device::ACCELERATED;} @@ -253,7 +157,7 @@ index ccee71cd7..f664f70c3 100644 #else MFEM_CONTRACT_VAR(dev); MFEM_CONTRACT_VAR(ngpu); -@@ -418,7 +341,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) +@@ -418,7 +409,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) static void RajaDeviceSetup(const int dev, int &ngpu) { #ifdef MFEM_USE_CUDA @@ -262,76 +166,25 @@ index ccee71cd7..f664f70c3 100644 #elif defined(MFEM_USE_HIP) HipDeviceSetup(dev, ngpu); #else -@@ -443,7 +366,7 @@ static void OccaDeviceSetup(const int dev) - std::string mode("mode: 'CUDA', device_id : "); - internal::occaDevice.setup(mode.append(1,'0'+dev)); - #else -- MFEM_ABORT("the OCCA CUDA backend requires OCCA built with CUDA!"); -+ MFEM_ABORT("The OCCA CUDA backend requires OCCA built with CUDA!"); - #endif - } - else if (omp) -@@ -451,7 +374,7 @@ static void OccaDeviceSetup(const int dev) - #if OCCA_OPENMP_ENABLED - internal::occaDevice.setup("mode: 'OpenMP'"); - #else -- MFEM_ABORT("the OCCA OpenMP backend requires OCCA built with OpenMP!"); -+ MFEM_ABORT("The OCCA OpenMP backend requires OCCA built with OpenMP!"); - #endif - } - else -@@ -477,7 +400,7 @@ static void OccaDeviceSetup(const int dev) - occa::loadKernels("mfem"); - #else - MFEM_CONTRACT_VAR(dev); -- MFEM_ABORT("the OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); 
-+ MFEM_ABORT("The OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); - #endif - } - -@@ -502,80 +425,136 @@ static void CeedDeviceSetup(const char* ceed_spec) +@@ -502,7 +493,7 @@ static void CeedDeviceSetup(const char* ceed_spec) #endif } -void Device::Setup(const int device_id) +void Device::Setup(const std::string &device_option, const int device_id) { -- MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); -+ MFEM_VERIFY(ngpu == -1, "The mfem::Device is already configured!"); - - ngpu = 0; - dev = device_id; - #ifndef MFEM_USE_CUDA - MFEM_VERIFY(!Allows(Backend::CUDA_MASK), -- "the CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); -+ "The CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); - #endif - #ifndef MFEM_USE_HIP - MFEM_VERIFY(!Allows(Backend::HIP_MASK), -- "the HIP backends require MFEM built with MFEM_USE_HIP=YES"); -+ "The HIP backends require MFEM built with MFEM_USE_HIP=YES"); - #endif - #ifndef MFEM_USE_RAJA - MFEM_VERIFY(!Allows(Backend::RAJA_MASK), -- "the RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); -+ "The RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); - #endif - #ifndef MFEM_USE_OPENMP - MFEM_VERIFY(!Allows(Backend::OMP|Backend::RAJA_OMP), -- "the OpenMP and RAJA OpenMP backends require MFEM built with" -+ "The OpenMP and RAJA OpenMP backends require MFEM built with" - " MFEM_USE_OPENMP=YES"); - #endif + MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); + +@@ -528,54 +519,41 @@ void Device::Setup(const int device_id) #ifndef MFEM_USE_CEED MFEM_VERIFY(!Allows(Backend::CEED_MASK), -- "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); + "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); -#else - int ceed_cpu = Allows(Backend::CEED_CPU); - int ceed_cuda = Allows(Backend::CEED_CUDA); - int ceed_hip = Allows(Backend::CEED_HIP); - MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip <= 1, - "Only one CEED backend can be enabled at a time!"); -+ "The 
CEED backends require MFEM built with MFEM_USE_CEED=YES"); #endif if (Allows(Backend::CUDA)) { CudaDeviceSetup(dev, ngpu); } if (Allows(Backend::HIP)) { HipDeviceSetup(dev, ngpu); } @@ -340,6 +193,29 @@ index ccee71cd7..f664f70c3 100644 - // The check for MFEM_USE_OCCA is in the function OccaDeviceSetup(). if (Allows(Backend::OCCA_MASK)) { OccaDeviceSetup(dev); } - if (Allows(Backend::CEED_CPU)) +- { +- if (!device_option) +- { +- CeedDeviceSetup("/cpu/self"); +- } +- else +- { +- CeedDeviceSetup(device_option); +- } +- } +- if (Allows(Backend::CEED_CUDA)) +- { +- if (!device_option) +- { +- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! +- CeedDeviceSetup("/gpu/cuda/gen"); +- } +- else +- { +- CeedDeviceSetup(device_option); +- } +- } +- if (Allows(Backend::CEED_HIP)) + if (Allows(Backend::CEED_MASK)) { - if (!device_option) @@ -360,7 +236,7 @@ index ccee71cd7..f664f70c3 100644 + std::string::size_type beg = device_option.find(ceed_spec_search), end; + if (beg == std::string::npos) { -- CeedDeviceSetup("/cpu/self"); +- CeedDeviceSetup("/gpu/hip"); + CeedDeviceSetup(ceed_spec_default); } else @@ -371,98 +247,13 @@ index ccee71cd7..f664f70c3 100644 + CeedDeviceSetup(device_option.substr(beg + 1, end - beg - 1).c_str()); } } -- if (Allows(Backend::CEED_CUDA)) -+ if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+} -+ -+void Device::UpdateMemoryTypeAndClass(const std::string &device_option) -+{ -+ const bool debug = Device::Allows(Backend::DEBUG_DEVICE); -+ const bool device = Device::Allows(Backend::DEVICE_MASK); -+ -+#ifdef MFEM_USE_UMPIRE -+ // If MFEM has been compiled with Umpire support, use it as the default -+ if (!mem_host_env && !mem_types_set) - { -- if (!device_option) -- { -- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! 
-- CeedDeviceSetup("/gpu/cuda/gen"); -- } -- else -+ host_mem_type = MemoryType::HOST_UMPIRE; -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ device_mem_type = MemoryType::HOST_UMPIRE; - } - } -- if (Allows(Backend::CEED_HIP)) -+#endif -+ -+ // Enable the device memory type -+ if (device) - { -- if (!device_option) -- { -- CeedDeviceSetup("/gpu/hip"); -- } -- else -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ if (mem_host_env) -+ { -+ switch (host_mem_type) -+ { -+ case MemoryType::HOST_UMPIRE: -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+ break; -+ case MemoryType::HOST_DEBUG: -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ break; -+ default: -+ device_mem_type = MemoryType::DEVICE; -+ } -+ } -+ else if (!mem_types_set) -+ { -+#ifndef MFEM_USE_UMPIRE -+ device_mem_type = MemoryType::DEVICE; -+#else -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+#endif -+ } - } -+ device_mem_class = MemoryClass::DEVICE; - } -- if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+ -+ // Enable the UVM shortcut when requested -+ if (device && device_option.find(":uvm") != std::string::npos) -+ { -+ host_mem_type = MemoryType::MANAGED; -+ device_mem_type = MemoryType::MANAGED; -+ } -+ -+ // Enable the DEBUG mode when requested -+ if (debug) -+ { -+ host_mem_type = MemoryType::HOST_DEBUG; -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ } -+ -+ MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -+ "Invalid device memory configuration!"); -+ -+ // Update the memory manager with the new settings -+ mm.Configure(host_mem_type, device_mem_type); + if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } } -} // mfem +} // namespace mfem diff --git a/general/device.hpp b/general/device.hpp -index baa27397f..a2d89e22e 100644 +index 5e236430b..07e6d0c19 100644 --- a/general/device.hpp +++ b/general/device.hpp @@ -14,6 +14,7 @@ @@ -473,15 +264,7 @@ index baa27397f..a2d89e22e 100644 namespace mfem { -@@ -81,7 +82,6 @@ struct Backend - { - /// Number 
of backends: from (1 << 0) to (1 << (NUM_BACKENDS-1)). - NUM_BACKENDS = 15, -- - /// Biwise-OR of all CPU backends - CPU_MASK = CPU | RAJA_CPU | OCCA_CPU | CEED_CPU, - /// Biwise-OR of all CUDA backends -@@ -94,7 +94,6 @@ struct Backend +@@ -94,7 +95,6 @@ struct Backend CEED_MASK = CEED_CPU | CEED_CUDA | CEED_HIP, /// Biwise-OR of all device backends DEVICE_MASK = CUDA_MASK | HIP_MASK | DEBUG_DEVICE, @@ -489,15 +272,14 @@ index baa27397f..a2d89e22e 100644 /// Biwise-OR of all RAJA backends RAJA_MASK = RAJA_CPU | RAJA_OMP | RAJA_CUDA | RAJA_HIP, /// Biwise-OR of all OCCA backends -@@ -122,50 +121,44 @@ class Device +@@ -122,16 +122,16 @@ class Device { private: friend class MemoryManager; - enum MODES {SEQUENTIAL, ACCELERATED}; -- -- static bool device_env, mem_host_env, mem_device_env, mem_types_set; + + static bool device_env, mem_host_env, mem_device_env, mem_types_set; static MFEM_EXPORT Device device_singleton; -+ static bool device_env, mem_host_env, mem_device_env, mem_types_set; - MODES mode = Device::SEQUENTIAL; int dev = 0; ///< Device ID of the configured device. @@ -509,19 +291,8 @@ index baa27397f..a2d89e22e 100644 /// Set to true during configuration, except in 'device_singleton'. bool destroy_mm = false; bool mpi_gpu_aware = false; - -- MemoryType host_mem_type = MemoryType::HOST; ///< Current Host MemoryType -- MemoryClass host_mem_class = MemoryClass::HOST; ///< Current Host MemoryClass -+ /// Current host MemoryType. -+ MemoryType host_mem_type = MemoryType::HOST; -+ /// Current host MemoryClass. -+ MemoryClass host_mem_class = MemoryClass::HOST; - -- /// Current Device MemoryType -+ /// Current device MemoryType. - MemoryType device_mem_type = MemoryType::HOST; -- /// Current Device MemoryClass -+ /// Current device MemoryClass. 
+@@ -144,28 +144,21 @@ private: + /// Current Device MemoryClass MemoryClass device_mem_class = MemoryClass::HOST; - char *device_option = NULL; @@ -531,27 +302,27 @@ index baa27397f..a2d89e22e 100644 - - /// Setup switcher based on configuration settings - void Setup(const int device_id = 0); +- +- void MarkBackend(Backend::Id b) { backends |= b; } + // Delete copy constructor and copy assignment. + Device(Device const &) = delete; + void operator=(Device const &) = delete; -- void MarkBackend(Backend::Id b) { backends |= b; } -+ // Access the Device singleton. -+ static Device &Get() { return device_singleton; } - - void UpdateMemoryTypeAndClass(); -+ /// Setup switcher based on configuration settings. -+ void Setup(const std::string &device_option, const int device_id); ++ // Access the Device singleton. ++ static Device& Get() { return device_singleton; } - /// Enable the use of the configured device in the code that follows. - /** After this call MFEM classes will use the backend kernels whenever - possible, transferring data automatically to the device, if necessary. -+ /// Configure host/device MemoryType/MemoryClass. -+ void UpdateMemoryTypeAndClass(const std::string &device_option); ++ /// Setup switcher based on configuration settings. ++ void Setup(const std::string &device_option, const int device_id); - If the only configured backend is the default host CPU one, the device - will remain disabled. -- ++ /// Configure host/device MemoryType/MemoryClass. ++ void UpdateMemoryTypeAndClass(const std::string &device_option); + - If the device is actually enabled, this method will also update the - current host/device MemoryType and MemoryClass. */ - static void Enable(); @@ -581,91 +352,42 @@ index baa27397f..a2d89e22e 100644 * The available backends are described by the Backend class. * The string name of a backend is the lowercase version of the Backend::Id enumeration constant with '_' replaced by '-', e.g. 
the -@@ -219,8 +212,12 @@ public: +@@ -219,8 +212,10 @@ public: and evaluation of operators and enables the 'hip' backend to avoid transfers between host and device. * The 'debug' backend should not be combined with other device backends. - */ - void Configure(const std::string &device, const int dev = 0); ++ + @note If the device is actually enabled, this method will also update the + current host/device MemoryType and MemoryClass. */ + void Configure(const std::string &device, const int device_id = 0); -+ -+ /// Print the configuration of the MFEM virtual device object. -+ void Print(std::ostream &out = mfem::out); /// Set the default host and device MemoryTypes, @a h_mt and @a d_mt. /** The host and device MemoryTypes are also set to be dual to each other. -@@ -233,60 +230,64 @@ public: - the subsequent Device configuration. */ - static void SetMemoryTypes(MemoryType h_mt, MemoryType d_mt); - -- /// Print the configuration of the MFEM virtual device object. -- void Print(std::ostream &out = mfem::out); -- - /// Return true if Configure() has been called previously. -- static inline bool IsConfigured() { return Get().ngpu >= 0; } -+ static bool IsConfigured() { return Get().ngpu >= 0; } - - /// Return true if an actual device (e.g. GPU) has been configured. -- static inline bool IsAvailable() { return Get().ngpu > 0; } -+ static bool IsAvailable() { return Get().ngpu > 0; } +@@ -243,14 +238,17 @@ public: + static inline bool IsAvailable() { return Get().ngpu > 0; } /// Return true if any backend other than Backend::CPU is enabled. - static inline bool IsEnabled() { return Get().mode == ACCELERATED; } -+ static bool IsEnabled() { return Get().backends & ~(Backend::CPU); } ++ static inline bool IsEnabled() { return Get().backends & ~(Backend::CPU); } /// The opposite of IsEnabled(). -- static inline bool IsDisabled() { return !IsEnabled(); } -+ static bool IsDisabled() { return !IsEnabled(); } -+ -+ /// Get the device ID of the configured device. 
-+ static int GetId() { return Get().dev; } + static inline bool IsDisabled() { return !IsEnabled(); } - /// Get the device id of the configured device. -- static inline int GetId() { return Get().dev; } -+ /// Get the number of available devices (may be called before configuration). -+ static int GetNumGPU(); ++ /// Get the device ID of the configured device. + static inline int GetId() { return Get().dev; } ++ /// Get the number of available devices (may be called before configuration). ++ static int GetDeviceCount(); ++ /** @brief Return true if any of the backends in the backend mask, @a b_mask, are allowed. */ /** This method can be used with any of the Backend::Id constants, the - Backend::*_MASK, or combinations of those. */ -- static inline bool Allows(unsigned long b_mask) -+ static bool Allows(unsigned long b_mask) - { return Get().backends & b_mask; } - - /** @brief Get the current Host MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory used on the host. - */ -- static inline MemoryType GetHostMemoryType() { return Get().host_mem_type; } -+ static MemoryType GetHostMemoryType() { return Get().host_mem_type; } - - /** @brief Get the current Host MemoryClass. This is the MemoryClass used - by most MFEM host Memory objects. */ -- static inline MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } -+ static MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } - - /** @brief Get the current Device MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory to be used with device kernels. - */ -- static inline MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryType(). - /** @deprecated Use GetDeviceMemoryType() instead. 
*/ -- static inline MemoryType GetMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetMemoryType() { return Get().device_mem_type; } - - /** @brief Get the current Device MemoryClass. This is the MemoryClass used - by most MFEM device kernels to access Memory objects. */ -- static inline MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryClass(). +@@ -284,9 +282,13 @@ public: /** @deprecated Use GetDeviceMemoryClass() instead. */ -- static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetMemoryClass() { return Get().device_mem_class; } + static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } + /** @brief Manually set the status of GPU-aware MPI flag for use in MPI + communication routines which have optimized implementations for device @@ -677,7 +399,7 @@ index baa27397f..a2d89e22e 100644 static bool GetGPUAwareMPI() { return Get().mpi_gpu_aware; } }; -@@ -298,7 +299,7 @@ public: +@@ -298,7 +300,7 @@ public: and ReadWrite(), while setting the device use flag in @a mem, if @a on_dev is true. 
*/ template @@ -686,7 +408,7 @@ index baa27397f..a2d89e22e 100644 { if (!on_dev) { -@@ -362,6 +363,6 @@ inline T *HostReadWrite(Memory &mem, int size) +@@ -362,6 +364,6 @@ inline T *HostReadWrite(Memory &mem, int size) return mfem::ReadWrite(mem, size, false); } diff --git a/extern/patch/mfem/patch_par_tet_mesh_fix.diff b/extern/patch/mfem/patch_par_tet_mesh_fix.diff index 58aa441f4..c4842acc2 100644 --- a/extern/patch/mfem/patch_par_tet_mesh_fix.diff +++ b/extern/patch/mfem/patch_par_tet_mesh_fix.diff @@ -1,5 +1,5 @@ diff --git a/mesh/element.hpp b/mesh/element.hpp -index 1a265a506..12467ad20 100644 +index e9dd07217..4129789bb 100644 --- a/mesh/element.hpp +++ b/mesh/element.hpp @@ -87,9 +87,6 @@ public: @@ -13,7 +13,7 @@ index 1a265a506..12467ad20 100644 virtual int NeedRefinement(HashTable &v_to_v) const { return 0; } diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index d20f689ab..88f5537cc 100644 +index ebb273f62..aa416e1b1 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp @@ -2146,7 +2146,7 @@ void Mesh::FinalizeTriMesh(int generate_edges, int refine, bool fix_orientation) @@ -384,7 +384,7 @@ index d20f689ab..88f5537cc 100644 // static method diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index fd9d0f3c6..67ab7a5ea 100644 +index 3870604aa..df53ac980 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp @@ -336,12 +336,9 @@ protected: @@ -403,7 +403,7 @@ index fd9d0f3c6..67ab7a5ea 100644 // Methods used to prepare and apply permutation of the mesh nodes assuming diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index ec612ee02..25e6bae5b 100644 +index 3f23d1877..f8d934556 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp @@ -20,6 +20,7 @@ @@ -580,7 +580,7 @@ index ec612ee02..25e6bae5b 100644 } diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index 22eecbd42..133a69b41 100644 +index 9653316bf..cf7f1e77a 100644 --- a/mesh/tetrahedron.cpp +++ b/mesh/tetrahedron.cpp @@ -13,6 +13,8 @@ @@ -670,7 +670,7 @@ index 22eecbd42..133a69b41 100644 + } diff --git a/mesh/tetrahedron.hpp 
b/mesh/tetrahedron.hpp -index e6093bc08..002163714 100644 +index 83665454e..821d18de4 100644 --- a/mesh/tetrahedron.hpp +++ b/mesh/tetrahedron.hpp @@ -74,7 +74,9 @@ public: @@ -685,7 +685,7 @@ index e6093bc08..002163714 100644 void ResetTransform(int tr) override { transform = tr; } unsigned GetTransform() const override { return transform; } diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp -index eb7493398..80d11f4b6 100644 +index ed05f1d37..26334fd94 100644 --- a/mesh/triangle.cpp +++ b/mesh/triangle.cpp @@ -11,6 +11,8 @@ @@ -794,7 +794,7 @@ index eb7493398..80d11f4b6 100644 + } // namespace mfem diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp -index fafd8811b..e35baf294 100644 +index 7bfdb23ef..feaa1674c 100644 --- a/mesh/triangle.hpp +++ b/mesh/triangle.hpp @@ -47,13 +47,14 @@ public: diff --git a/extern/patch/mfem/patch_pncmesh_update_fix.diff b/extern/patch/mfem/patch_pncmesh_update_fix.diff deleted file mode 100644 index 2e4810382..000000000 --- a/extern/patch/mfem/patch_pncmesh_update_fix.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/mesh/pncmesh.cpp b/mesh/pncmesh.cpp -index 60aca4583..34d59e567 100644 ---- a/mesh/pncmesh.cpp -+++ b/mesh/pncmesh.cpp -@@ -107,6 +107,8 @@ void ParNCMesh::Update() - entity_owner[i].DeleteAll(); - entity_pmat_group[i].DeleteAll(); - entity_index_rank[i].DeleteAll(); -+ entity_conf_group[i].DeleteAll(); -+ entity_elem_local[i].DeleteAll(); - } - - shared_vertices.Clear(); diff --git a/extern/patch/mfem/patch_workspace_vectors.diff b/extern/patch/mfem/patch_workspace_vectors.diff new file mode 100644 index 000000000..04522f66a --- /dev/null +++ b/extern/patch/mfem/patch_workspace_vectors.diff @@ -0,0 +1,1562 @@ +diff --git a/fem/bilinearform_ext.cpp b/fem/bilinearform_ext.cpp +index 1c26575b8..17808cda2 100644 +--- a/fem/bilinearform_ext.cpp ++++ b/fem/bilinearform_ext.cpp +@@ -13,6 +13,7 @@ + // PABilinearFormExtension and MFBilinearFormExtension. 
+ + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + #include "bilinearform.hpp" + #include "pbilinearform.hpp" + #include "pgridfunc.hpp" +@@ -68,6 +69,9 @@ void MFBilinearFormExtension::AssembleDiagonal(Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict && !DeviceCanUseCeed()) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + localY = 0.0; + for (int i = 0; i < iSz; ++i) + { +@@ -142,6 +146,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + } + else + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + elem_restrict->Mult(x, localX); + localY = 0.0; + for (int i = 0; i < iSz; ++i) +@@ -155,6 +162,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -171,6 +181,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) + { +@@ -190,6 +203,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + elem_restrict->Mult(x, localX); + localY = 
0.0; + for (int i = 0; i < iSz; ++i) +@@ -212,6 +228,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -228,6 +247,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) + { +@@ -259,10 +281,6 @@ void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + elem_restrict = trial_fes->GetElementRestriction(ordering); + if (elem_restrict) + { +- localX.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType()); +- localY.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType()); +- localY.UseDevice(true); // ensure 'localY = 0.0' is done on device +- + // Gather the attributes on the host from all the elements + const Mesh &mesh = *trial_fes->GetMesh(); + elem_attributes.SetSize(mesh.GetNE()); +@@ -279,9 +297,6 @@ void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + int_face_restrict_lex = trial_fes->GetFaceRestriction( + ElementDofOrdering::LEXICOGRAPHIC, + FaceType::Interior); +- int_face_X.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType()); +- int_face_Y.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType()); +- int_face_Y.UseDevice(true); // ensure 'int_face_Y = 0.0' is done on device + } + + const bool has_bdr_integs = (a->GetBFBFI()->Size() > 0 || +@@ -292,9 +307,6 @@ void 
PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + ElementDofOrdering::LEXICOGRAPHIC, + FaceType::Boundary, + m); +- bdr_face_X.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_face_Y.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_face_Y.UseDevice(true); // ensure 'faceBoundY = 0.0' is done on device + + const Mesh &mesh = *trial_fes->GetMesh(); + // See LinearFormExtension::Update for explanation of f_to_be logic. +@@ -383,6 +395,7 @@ void PABilinearFormExtension::AssembleDiagonal(Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict && !DeviceCanUseCeed()) + { ++ auto localY = Workspace::NewVector(elem_restrict->Height()); + if (iSz > 0) + { + localY = 0.0; +@@ -420,6 +433,7 @@ void PABilinearFormExtension::AssembleDiagonal(Vector &y) const + const int n_bdr_integs = bdr_integs.Size(); + if (bdr_face_restrict_lex && n_bdr_integs > 0) + { ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); + bdr_face_Y = 0.0; + for (int i = 0; i < n_bdr_integs; ++i) + { +@@ -504,6 +518,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + { + if (iSz) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + Array*> &elem_markers = *a->GetDBFI_Marker(); + elem_restrict->Mult(x, localX); + localY = 0.0; +@@ -524,6 +541,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -543,6 +563,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const bool has_bdr_integs = (n_bdr_face_integs > 0 || n_bdr_integs 
> 0); + if (bdr_face_restrict_lex && has_bdr_integs) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + Array*> &bdr_markers = *a->GetBBFI_Marker(); + Array*> &bdr_face_markers = *a->GetBFBFI_Marker(); + bdr_face_restrict_lex->Mult(x, bdr_face_X); +@@ -570,6 +593,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + Array*> &elem_markers = *a->GetDBFI_Marker(); + elem_restrict->Mult(x, localX); + localY = 0.0; +@@ -594,6 +620,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -613,6 +642,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const bool has_bdr_integs = (n_bdr_face_integs > 0 || n_bdr_integs > 0); + if (bdr_face_restrict_lex && has_bdr_integs) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + Array*> &bdr_markers = *a->GetBBFI_Marker(); + Array*> &bdr_face_markers = *a->GetBFBFI_Marker(); + +@@ -670,7 +702,7 @@ void PABilinearFormExtension::AddMultWithMarkers( + { + if (markers) + { +- tmp_evec.SetSize(y.Size()); ++ auto tmp_evec = Workspace::NewVector(y.Size()); + tmp_evec = 0.0; + if (transpose) { integ.AddMultTransposePA(x, tmp_evec); } + else { integ.AddMultPA(x, tmp_evec); } +@@ -769,6 +801,10 @@ void 
EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + { + // Apply the Element Restriction + const bool useRestrict = !DeviceCanUseCeed() && elem_restrict; ++ ++ auto localX = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ auto localY = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ + if (!useRestrict) + { + y.UseDevice(true); // typically this is a large vector, so store on device +@@ -808,6 +844,9 @@ void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + // Apply the Interior Face Restriction + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) +@@ -866,6 +905,9 @@ void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + // Apply the Boundary Face Restriction + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) +@@ -897,6 +939,10 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + { + // Apply the Element Restriction + const bool useRestrict = !DeviceCanUseCeed() && elem_restrict; ++ ++ auto localX = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ auto localY = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ + if (!useRestrict) + { + y.UseDevice(true); // typically this is a large vector, so store on device +@@ -936,6 +982,9 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && 
iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + // Apply the Interior Face Restriction + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) +@@ -994,6 +1043,9 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + // Apply the Boundary Face Restriction + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) +@@ -1050,12 +1102,10 @@ void FABilinearFormExtension::Assemble() + { + pfes->ExchangeFaceNbrData(); + width += pfes->GetFaceNbrVSize(); +- dg_x.SetSize(width); + ParBilinearForm *pb = nullptr; + if ((pb = dynamic_cast(a)) && (pb->keep_nbr_block)) + { + height += pfes->GetFaceNbrVSize(); +- dg_y.SetSize(height); + keep_nbr_block = true; + } + } +@@ -1217,6 +1267,7 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const + x_gf.ExchangeFaceNbrData(); + Vector &shared_x = x_gf.FaceNbrData(); + const int local_size = a->FESpace()->GetVSize(); ++ auto dg_x = Workspace::NewVector(width); + auto dg_x_ptr = dg_x.Write(); + auto x_ptr = x.Read(); + mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i) +@@ -1232,6 +1283,7 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const + ParBilinearForm *pform = nullptr; + if ((pform = dynamic_cast(a)) && (pform->keep_nbr_block)) + { ++ auto dg_y = Workspace::NewVector(height); + mat->Mult(dg_x, dg_y); + // DG Restriction + auto dg_y_ptr = dg_y.Read(); +@@ -1278,6 +1330,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const + x_gf.ExchangeFaceNbrData(); + Vector &shared_x = x_gf.FaceNbrData(); + const int local_size 
= a->FESpace()->GetVSize(); ++ auto dg_x = Workspace::NewVector(width); + auto dg_x_ptr = dg_x.Write(); + auto x_ptr = x.Read(); + mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i) +@@ -1293,6 +1346,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const + ParBilinearForm *pb = nullptr; + if ((pb = dynamic_cast(a)) && (pb->keep_nbr_block)) + { ++ auto dg_y = Workspace::NewVector(height); + mat->MultTranspose(dg_x, dg_y); + // DG Restriction + auto dg_y_ptr = dg_y.Read(); +@@ -1392,17 +1446,6 @@ void PAMixedBilinearFormExtension::Update() + ElementDofOrdering::LEXICOGRAPHIC); + elem_restrict_test = test_fes->GetElementRestriction( + ElementDofOrdering::LEXICOGRAPHIC); +- if (elem_restrict_trial) +- { +- localTrial.UseDevice(true); +- localTrial.SetSize(elem_restrict_trial->Height(), +- Device::GetMemoryType()); +- } +- if (elem_restrict_test) +- { +- localTest.UseDevice(true); // ensure 'localY = 0.0' is done on device +- localTest.SetSize(elem_restrict_test->Height(), Device::GetMemoryType()); +- } + } + + void PAMixedBilinearFormExtension::FormRectangularSystemOperator( +@@ -1481,6 +1524,11 @@ void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y, + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_trial, x, localTrial, + elem_restrict_test, y, localTest, c); +@@ -1494,9 +1542,7 @@ void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y, + // * G^T operation + if (elem_restrict_test) + { +- tempY.SetSize(y.Size()); +- elem_restrict_test->MultTranspose(localTest, tempY); +- y += tempY; ++ elem_restrict_test->AddMultTranspose(localTest, y); + } + } + +@@ -1513,6 +1559,11 @@ void PAMixedBilinearFormExtension::AddMultTranspose(const 
Vector &x, Vector &y, + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_test, x, localTest, + elem_restrict_trial, y, localTrial, c); +@@ -1526,9 +1577,7 @@ void PAMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y, + // * G^T operation + if (elem_restrict_trial) + { +- tempY.SetSize(y.Size()); +- elem_restrict_trial->MultTranspose(localTrial, tempY); +- y += tempY; ++ elem_restrict_trial->AddMultTranspose(localTrial, y); + } + } + +@@ -1539,6 +1588,8 @@ void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D, + + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); + if (elem_restrict_trial) + { + const ElementRestriction* H1elem_restrict_trial = +@@ -1555,6 +1606,8 @@ void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D, + + if (elem_restrict_test) + { ++ auto localTest = Workspace::NewVector(elem_restrict_test->Height()); ++ + localTest = 0.0; + for (int i = 0; i < iSz; ++i) + { +@@ -1620,7 +1673,8 @@ void PADiscreteLinearOperatorExtension::Assemble() + + test_multiplicity.UseDevice(true); + test_multiplicity.SetSize(elem_restrict_test->Width()); // l-vector +- Vector ones(elem_restrict_test->Height()); // e-vector ++ ++ auto ones = Workspace::NewVector(elem_restrict_test->Height()); // e-vector + ones = 1.0; + + const ElementRestriction* elem_restrict = +@@ -1631,7 +1685,7 @@ void PADiscreteLinearOperatorExtension::Assemble() + } + else + { +- mfem_error("A real ElementRestriction is required in this setting!"); ++ MFEM_ABORT("A real ElementRestriction is required in this setting!"); + } + + auto tm = test_multiplicity.ReadWrite(); +@@ -1647,6 
+1701,11 @@ void PADiscreteLinearOperatorExtension::AddMult( + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_trial, x, localTrial, + elem_restrict_test, y, localTest, c); +@@ -1664,13 +1723,13 @@ void PADiscreteLinearOperatorExtension::AddMult( + dynamic_cast(elem_restrict_test); + if (elem_restrict) + { +- tempY.SetSize(y.Size()); ++ auto tempY = Workspace::NewVector(y.Size()); + elem_restrict->MultLeftInverse(localTest, tempY); + y += tempY; + } + else + { +- mfem_error("In this setting you need a real ElementRestriction!"); ++ MFEM_ABORT("In this setting you need a real ElementRestriction!"); + } + } + +@@ -1691,6 +1750,12 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose( + { + xs[i] *= tm[i]; + }); ++ ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + SetupMultInputs(elem_restrict_test, xscaled, localTest, + elem_restrict_trial, y, localTrial, c); + +@@ -1703,13 +1768,11 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose( + // * G^T operation + if (elem_restrict_trial) + { +- tempY.SetSize(y.Size()); +- elem_restrict_trial->MultTranspose(localTrial, tempY); +- y += tempY; ++ elem_restrict_trial->AddMultTranspose(localTrial, y); + } + else + { +- mfem_error("Trial ElementRestriction not defined"); ++ MFEM_ABORT("Trial ElementRestriction not defined"); + } + } + +diff --git a/fem/bilinearform_ext.hpp b/fem/bilinearform_ext.hpp +index 69caf2d42..ee76fe9d5 100644 +--- a/fem/bilinearform_ext.hpp ++++ b/fem/bilinearform_ext.hpp +@@ -70,10 +70,6 @@ protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned + 
/// Attributes of all mesh elements. + Array elem_attributes, bdr_attributes; +- mutable Vector tmp_evec; // Work array +- mutable Vector localX, localY; +- mutable Vector int_face_X, int_face_Y; +- mutable Vector bdr_face_X, bdr_face_Y; + const Operator *elem_restrict; // Not owned + const FaceRestriction *int_face_restrict_lex; // Not owned + const FaceRestriction *bdr_face_restrict_lex; // Not owned +@@ -141,7 +137,6 @@ class FABilinearFormExtension : public EABilinearFormExtension + { + private: + SparseMatrix *mat; +- mutable Vector dg_x, dg_y; + + public: + FABilinearFormExtension(BilinearForm *form); +@@ -170,9 +165,6 @@ class MFBilinearFormExtension : public BilinearFormExtension + { + protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned +- mutable Vector localX, localY; +- mutable Vector int_face_X, int_face_Y; +- mutable Vector bdr_face_X, bdr_face_Y; + const Operator *elem_restrict; // Not owned + const FaceRestriction *int_face_restrict_lex; // Not owned + const FaceRestriction *bdr_face_restrict_lex; // Not owned +@@ -240,7 +232,6 @@ class PAMixedBilinearFormExtension : public MixedBilinearFormExtension + { + protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned +- mutable Vector localTrial, localTest, tempY; + const Operator *elem_restrict_trial; // Not owned + const Operator *elem_restrict_test; // Not owned + +diff --git a/fem/linearform_ext.cpp b/fem/linearform_ext.cpp +index 355431eb7..f3d5adba2 100644 +--- a/fem/linearform_ext.cpp ++++ b/fem/linearform_ext.cpp +@@ -11,6 +11,7 @@ + + #include "linearform.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + namespace mfem + { +@@ -61,6 +62,7 @@ void LinearFormExtension::Assemble() + } + + // Assemble the linear form ++ auto b = Workspace::NewVector(elem_restrict_lex->Height()); + b = 0.0; + domain_integs[k]->AssembleDevice(fes, markers, b); + if (k == 0) { elem_restrict_lex->MultTranspose(b, *lf); } +@@ -104,6 +106,7 @@ void 
LinearFormExtension::Assemble() + } + + // Assemble the linear form ++ auto bdr_b = Workspace::NewVector(bdr_restrict_lex->Height()); + bdr_b = 0.0; + boundary_integs[k]->AssembleDevice(fes, bdr_markers, bdr_b); + bdr_restrict_lex->AddMultTranspose(bdr_b, *lf); +@@ -130,8 +133,6 @@ void LinearFormExtension::Update() + + elem_restrict_lex = fes.GetElementRestriction(ordering); + MFEM_VERIFY(elem_restrict_lex, "Element restriction not available"); +- b.SetSize(elem_restrict_lex->Height(), Device::GetMemoryType()); +- b.UseDevice(true); + } + + if (lf->boundary_integs.Size() > 0) +@@ -168,9 +169,6 @@ void LinearFormExtension::Update() + dynamic_cast( + fes.GetFaceRestriction(ordering, FaceType::Boundary, + L2FaceValues::SingleValued)); +- MFEM_VERIFY(bdr_restrict_lex, "Face restriction not available"); +- bdr_b.SetSize(bdr_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_b.UseDevice(true); + } + } + +diff --git a/fem/linearform_ext.hpp b/fem/linearform_ext.hpp +index 0103185e0..eb54d8745 100644 +--- a/fem/linearform_ext.hpp ++++ b/fem/linearform_ext.hpp +@@ -39,9 +39,6 @@ class LinearFormExtension + /// Operator that converts L-vectors to boundary E-vectors. + const FaceRestriction *bdr_restrict_lex; // Not owned + +- /// Internal E-vectors. +- mutable Vector b, bdr_b; +- + public: + + /// \brief Create a LinearForm extension of @a lf. 
+diff --git a/fem/qinterp/det.cpp b/fem/qinterp/det.cpp +index e0b034b82..dda386644 100644 +--- a/fem/qinterp/det.cpp ++++ b/fem/qinterp/det.cpp +@@ -11,6 +11,7 @@ + + #include "../quadinterpolator.hpp" + #include "../../general/forall.hpp" ++#include "../../general/workspace.hpp" + #include "../../linalg/dtensor.hpp" + #include "../../fem/kernels.hpp" + #include "../../linalg/kernels.hpp" +@@ -204,8 +205,7 @@ static void Det3D(const int NE, + const double *x, + double *y, + const int d1d = 0, +- const int q1d = 0, +- Vector *d_buff = nullptr) // used only with SMEM = false ++ const int q1d = 0) + { + constexpr int DIM = 3; + static constexpr int GRID = SMEM ? 0 : 128; +@@ -219,6 +219,7 @@ static void Det3D(const int NE, + auto Y = Reshape(y, Q1D, Q1D, Q1D, NE); + + double *GM = nullptr; ++ int buffer_size = 0; + if (!SMEM) + { + const DeviceDofQuadLimits &limits = DeviceDofQuadLimits::Get(); +@@ -226,9 +227,11 @@ static void Det3D(const int NE, + const int max_d1d = T_D1D ? T_D1D : limits.MAX_Q1D; + const int max_qd = std::max(max_q1d, max_d1d); + const int mem_size = max_qd * max_qd * max_qd * 9; +- d_buff->SetSize(2*mem_size*GRID); +- GM = d_buff->Write(); ++ buffer_size = 2*mem_size*GRID; + } ++ // if SMEM is true, d_buff will be empty (zero size) ++ auto d_buff = Workspace::NewVector(buffer_size); ++ GM = d_buff.Write(); + + mfem::forall_3D_grid(NE, Q1D, Q1D, Q1D, GRID, [=] MFEM_HOST_DEVICE (int e) + { +@@ -278,8 +281,7 @@ void TensorDeterminants(const int NE, + const int vdim, + const DofToQuad &maps, + const Vector &e_vec, +- Vector &q_det, +- Vector &d_buff) ++ Vector &q_det) + { + if (NE == 0) { return; } + const int dim = maps.FE->GetDim(); +@@ -347,8 +349,7 @@ void TensorDeterminants(const int NE, + if (D1D <= MD && Q1D <= MQ) + { return Det3D<0,0,true>(NE,B,G,X,Y,D1D,Q1D); } + // Last fall-back will use global memory +- return Det3D<0,0,false>( +- NE,B,G,X,Y,D1D,Q1D,&d_buff); ++ return Det3D<0,0,false>(NE,B,G,X,Y,D1D,Q1D); + } + } + } +diff --git 
a/fem/qinterp/dispatch.hpp b/fem/qinterp/dispatch.hpp +index 7e7c0ebfa..9faf53b81 100644 +--- a/fem/qinterp/dispatch.hpp ++++ b/fem/qinterp/dispatch.hpp +@@ -54,8 +54,7 @@ void TensorDeterminants(const int NE, + const int vdim, + const DofToQuad &maps, + const Vector &e_vec, +- Vector &q_det, +- Vector &d_buff); ++ Vector &q_det); + + } // namespace quadrature_interpolator + +diff --git a/fem/quadinterpolator.cpp b/fem/quadinterpolator.cpp +index 8314fdcfb..67e77dd9a 100644 +--- a/fem/quadinterpolator.cpp ++++ b/fem/quadinterpolator.cpp +@@ -28,7 +28,6 @@ QuadratureInterpolator::QuadratureInterpolator(const FiniteElementSpace &fes, + q_layout(QVectorLayout::byNODES), + use_tensor_products(UsesTensorBasis(fes)) + { +- d_buffer.UseDevice(true); + if (fespace->GetNE() == 0) { return; } + const FiniteElement *fe = fespace->GetFE(0); + MFEM_VERIFY(dynamic_cast(fe) != NULL, +@@ -44,7 +43,6 @@ QuadratureInterpolator::QuadratureInterpolator(const FiniteElementSpace &fes, + q_layout(QVectorLayout::byNODES), + use_tensor_products(UsesTensorBasis(fes)) + { +- d_buffer.UseDevice(true); + if (fespace->GetNE() == 0) { return; } + const FiniteElement *fe = fespace->GetFE(0); + MFEM_VERIFY(dynamic_cast(fe) != NULL, +@@ -530,7 +528,7 @@ void QuadratureInterpolator::Mult(const Vector &e_vec, + } + if (eval_flags & DETERMINANTS) + { +- TensorDeterminants(ne, vdim, maps, e_vec, q_det, d_buffer); ++ TensorDeterminants(ne, vdim, maps, e_vec, q_det); + } + } + else // use_tensor_eval == false +diff --git a/fem/quadinterpolator.hpp b/fem/quadinterpolator.hpp +index 6672ee2f1..95fa8c9a3 100644 +--- a/fem/quadinterpolator.hpp ++++ b/fem/quadinterpolator.hpp +@@ -37,7 +37,6 @@ protected: + mutable QVectorLayout q_layout; ///< Output Q-vector layout + + mutable bool use_tensor_products; ///< Tensor product evaluation mode +- mutable Vector d_buffer; ///< Auxiliary device buffer + + public: + static const int MAX_NQ2D = 100; +diff --git a/fem/transfer.cpp b/fem/transfer.cpp +index 
a1eee41db..1db97d041 100644 +--- a/fem/transfer.cpp ++++ b/fem/transfer.cpp +@@ -12,6 +12,7 @@ + #include "transfer.hpp" + #include "bilinearform.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + namespace mfem + { +@@ -1399,15 +1400,10 @@ TensorProductPRefinementTransferOperator( + MFEM_VERIFY(elem_restrict_lex_h, + "High order ElementRestriction not available"); + +- localL.SetSize(elem_restrict_lex_l->Height(), Device::GetMemoryType()); +- localH.SetSize(elem_restrict_lex_h->Height(), Device::GetMemoryType()); +- localL.UseDevice(true); +- localH.UseDevice(true); +- + MFEM_VERIFY(dynamic_cast(elem_restrict_lex_h), + "High order element restriction is of unsupported type"); + +- mask.SetSize(localH.Size(), Device::GetMemoryType()); ++ mask.SetSize(elem_restrict_lex_h->Height(), Device::GetMemoryType()); + static_cast(elem_restrict_lex_h) + ->BooleanMask(mask); + mask.UseDevice(true); +@@ -1680,6 +1676,9 @@ void TensorProductPRefinementTransferOperator::Mult(const Vector& x, + return; + } + ++ auto localH = Workspace::NewVector(elem_restrict_lex_h->Height()); ++ auto localL = Workspace::NewVector(elem_restrict_lex_l->Height()); ++ + elem_restrict_lex_l->Mult(x, localL); + if (dim == 2) + { +@@ -1706,6 +1705,9 @@ void TensorProductPRefinementTransferOperator::MultTranspose(const Vector& x, + return; + } + ++ auto localH = Workspace::NewVector(elem_restrict_lex_h->Height()); ++ auto localL = Workspace::NewVector(elem_restrict_lex_l->Height()); ++ + elem_restrict_lex_h->Mult(x, localH); + if (dim == 2) + { +@@ -1731,7 +1733,7 @@ TrueTransferOperator::TrueTransferOperator(const FiniteElementSpace& lFESpace_, + lFESpace(lFESpace_), + hFESpace(hFESpace_) + { +- localTransferOperator = new TransferOperator(lFESpace_, hFESpace_); ++ localTransferOperator.reset(new TransferOperator(lFESpace_, hFESpace_)); + + P = lFESpace.GetProlongationMatrix(); + R = hFESpace.IsVariableOrder() ? 
hFESpace.GetHpRestrictionMatrix() : +@@ -1741,34 +1743,23 @@ TrueTransferOperator::TrueTransferOperator(const FiniteElementSpace& lFESpace_, + // P can be null and R not null + // If P is not null it is assumed that R is not null as well + if (P) { MFEM_VERIFY(R, "Both P and R have to be not NULL") } +- +- if (P) +- { +- tmpL.SetSize(lFESpace_.GetVSize()); +- tmpH.SetSize(hFESpace_.GetVSize()); +- } +- // P can be null and R not null +- else if (R) +- { +- tmpH.SetSize(hFESpace_.GetVSize()); +- } +-} +- +-TrueTransferOperator::~TrueTransferOperator() +-{ +- delete localTransferOperator; + } + + void TrueTransferOperator::Mult(const Vector& x, Vector& y) const + { + if (P) + { ++ auto tmpL = Workspace::NewVector(lFESpace.GetVSize()); ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + P->Mult(x, tmpL); + localTransferOperator->Mult(tmpL, tmpH); + R->Mult(tmpH, y); + } + else if (R) + { ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + localTransferOperator->Mult(x, tmpH); + R->Mult(tmpH, y); + } +@@ -1782,12 +1773,17 @@ void TrueTransferOperator::MultTranspose(const Vector& x, Vector& y) const + { + if (P) + { ++ auto tmpL = Workspace::NewVector(lFESpace.GetVSize()); ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + R->MultTranspose(x, tmpH); + localTransferOperator->MultTranspose(tmpH, tmpL); + P->MultTranspose(tmpL, y); + } + else if (R) + { ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + R->MultTranspose(x, tmpH); + localTransferOperator->MultTranspose(tmpH, y); + } +diff --git a/fem/transfer.hpp b/fem/transfer.hpp +index 10e3a3de3..58cd6583e 100644 +--- a/fem/transfer.hpp ++++ b/fem/transfer.hpp +@@ -470,8 +470,6 @@ private: + const Operator* elem_restrict_lex_l; + const Operator* elem_restrict_lex_h; + Vector mask; +- mutable Vector localL; +- mutable Vector localH; + + public: + /// @brief Constructs a transfer operator from \p lFESpace to \p hFESpace +@@ -505,9 +503,7 @@ private: + const FiniteElementSpace& 
hFESpace; + const Operator * P = nullptr; + const SparseMatrix * R = nullptr; +- TransferOperator* localTransferOperator; +- mutable Vector tmpL; +- mutable Vector tmpH; ++ std::unique_ptr localTransferOperator; + + public: + /// @brief Constructs a transfer operator working on true degrees of freedom +@@ -515,9 +511,6 @@ public: + TrueTransferOperator(const FiniteElementSpace& lFESpace_, + const FiniteElementSpace& hFESpace_); + +- /// Destructor +- ~TrueTransferOperator(); +- + /// @brief Interpolation or prolongation of a true dof vector \p x to a true + /// dof vector \p y. + /** The true dof vector \p x corresponding to the coarse space is restricted +diff --git a/general/CMakeLists.txt b/general/CMakeLists.txt +index 67c3653c4..6f610f959 100644 +--- a/general/CMakeLists.txt ++++ b/general/CMakeLists.txt +@@ -31,6 +31,7 @@ list(APPEND SRCS + tinyxml2.cpp + version.cpp + hip.cpp ++ workspace.cpp + ) + + list(APPEND HDRS +@@ -64,6 +65,7 @@ list(APPEND HDRS + text.hpp + version.hpp + hip.hpp ++ workspace.hpp + ) + + if (MFEM_USE_MPI) +diff --git a/general/workspace.cpp b/general/workspace.cpp +new file mode 100644 +index 000000000..ad8210a61 +--- /dev/null ++++ b/general/workspace.cpp +@@ -0,0 +1,144 @@ ++// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC. Produced ++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files ++// LICENSE and NOTICE for details. LLNL-CODE-806117. ++// ++// This file is part of the MFEM library. For more information and source code ++// availability visit https://mfem.org. ++// ++// MFEM is free software; you can redistribute it and/or modify it under the ++// terms of the BSD-3 license. We welcome feedback and contributions, see file ++// CONTRIBUTING.md for details. 
++ ++#include "workspace.hpp" ++ ++namespace mfem ++{ ++ ++WorkspaceVector::WorkspaceVector( ++ internal::WorkspaceChunk &chunk_, int offset_, int n) ++ : Vector(chunk_.GetData(), chunk_.GetOffset(), n), ++ chunk(chunk_), ++ offset(offset_), ++ original_size(n) ++{ ++ UseDevice(true); ++} ++ ++WorkspaceVector::WorkspaceVector(WorkspaceVector &&other) ++ : Vector(std::move(other)), ++ chunk(other.chunk), ++ offset(other.offset), ++ original_size(other.original_size) ++{ ++ if (this != &other) { other.moved_from = true; } ++} ++ ++WorkspaceVector::~WorkspaceVector() ++{ ++ if (!moved_from) { chunk.FreeVector(*this); } ++} ++ ++namespace internal ++{ ++ ++WorkspaceChunk::WorkspaceChunk(int capacity) ++ : data(capacity), original_capacity(capacity) ++{ } ++ ++WorkspaceVector WorkspaceChunk::NewVector(int n) ++{ ++ MFEM_ASSERT(HasCapacityFor(n), "Requested vector is too large."); ++ WorkspaceVector vector(*this, offset, n); ++ offset += n; ++ vector_count += 1; ++ return vector; ++} ++ ++void WorkspaceChunk::FreeVector(const WorkspaceVector &v) ++{ ++ MFEM_ASSERT(vector_count >= 0, ""); ++ vector_count -= 1; ++ // If the chunk is completely empty, we can reclaim all of the memory and ++ // allow new allocations (before it is completely empty, we cannot reclaim ++ // memory because we don't track the specific regions that are freed). ++ if (vector_count == 0) ++ { ++ offset = 0; ++ // If we are not the front chunk, deallocate the backing memory. This ++ // chunk will be consolidated later anyway. ++ if (!front) { data.Destroy(); } ++ } ++ else ++ { ++ // If the vector being freed is the most recent vector allocated (i.e. if ++ // the vector is freed in stack/LIFO order), then we can reclaim its ++ // memory by moving the offset. 
++ const int size = v.original_size; ++ if (v.offset == offset - size) ++ { ++ offset -= size; ++ } ++ } ++} ++ ++} // namespace internal ++ ++Workspace &Workspace::Instance() ++{ ++ static Workspace ws; ++ return ws; ++} ++ ++void Workspace::ConsolidateAndEnsureAvailable(int requested_size) ++{ ++ int n_empty = 0; ++ int empty_capacity = 0; ++ // Merge all empty chunks at the beginning of the list ++ auto it = chunks.begin(); ++ while (it != chunks.end() && it->IsEmpty()) ++ { ++ empty_capacity += it->GetOriginalCapacity(); ++ ++it; ++ ++n_empty; ++ } ++ ++ // If we have multiple empty chunks at the beginning of the list, we need ++ // to merge them. Also, if the front chunk is empty, but not big enough, ++ // we need to replace it, so we remove it here. ++ if (n_empty > 1 || requested_size > empty_capacity) ++ { ++ // Note: if chunks is empty, then 'it' will be chunks.begin(), and the ++ // next line is a no-op. ++ chunks.erase_after(chunks.before_begin(), it); ++ } ++ ++ const int capacity = chunks.empty() ? -1 : ++ chunks.front().GetAvailableCapacity(); ++ ++ const int min_chunk_size = std::max(requested_size, empty_capacity); ++ ++ if (min_chunk_size > capacity) ++ { ++ if (!chunks.empty()) { chunks.front().SetFront(false); } ++ chunks.emplace_front(min_chunk_size); ++ } ++} ++ ++WorkspaceVector Workspace::NewVector(int n) ++{ ++ Workspace &ws = Instance(); ++ ws.ConsolidateAndEnsureAvailable(n); ++ return ws.chunks.front().NewVector(n); ++} ++ ++void Workspace::Reserve(int n) ++{ ++ Instance().ConsolidateAndEnsureAvailable(n); ++} ++ ++void Workspace::Clear() ++{ ++ Instance().chunks.clear(); ++} ++ ++} // namespace mfem +diff --git a/general/workspace.hpp b/general/workspace.hpp +new file mode 100644 +index 000000000..b5c0ebae8 +--- /dev/null ++++ b/general/workspace.hpp +@@ -0,0 +1,176 @@ ++// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC. Produced ++// at the Lawrence Livermore National Laboratory. All Rights reserved. 
See files ++// LICENSE and NOTICE for details. LLNL-CODE-806117. ++// ++// This file is part of the MFEM library. For more information and source code ++// availability visit https://mfem.org. ++// ++// MFEM is free software; you can redistribute it and/or modify it under the ++// terms of the BSD-3 license. We welcome feedback and contributions, see file ++// CONTRIBUTING.md for details. ++ ++#ifndef MFEM_WORKSPACE_HPP ++#define MFEM_WORKSPACE_HPP ++ ++#include "../linalg/vector.hpp" ++#include ++ ++namespace mfem ++{ ++ ++// Forward declaration of internal::WorkspaceChunk ++namespace internal { class WorkspaceChunk; } ++ ++/// @brief A vector used as a short-lived workspace for temporary calculations, ++/// created with Workspace::NewVector. ++/// ++/// A WorkspaceVector is created using the Workspace bump allocator. The ++/// allocator can quickly and cheaply create new vectors, so that these vectors ++/// can be created and destroyed in loops without incurring the memory ++/// allocation and deallocation overhead. ++/// ++/// WorkspaceVector%s should be used only for short-lived temporary storage; for ++/// example, they are not intended to be stored as member data in other classes. ++class WorkspaceVector : public Vector ++{ ++ // using internal::WorkspaceChunk; ++ friend class internal::WorkspaceChunk; ++ ++ /// The WorkspaceChunk containing the data for this vector. ++ internal::WorkspaceChunk &chunk; ++ ++ /// Offset in the chunk. ++ const int offset; ++ ++ /// Original size allocated. ++ const int original_size; ++ ++ /// @brief Has this WorkspaceVector been moved from? If so, don't deallocate ++ /// from its WorkspaceChunk in the destructor. ++ bool moved_from = false; ++ ++ /// Private constructor, create with Workspace::NewVector() instead. ++ WorkspaceVector(internal::WorkspaceChunk &chunk_, int offset_, int n); ++ ++public: ++ /// @brief Move constructor. The moved-from WorkspaceVector has @a ++ /// size_in_chunk set to zero. 
++ WorkspaceVector(WorkspaceVector &&other); ++ ++ /// No copy constructor. ++ WorkspaceVector(WorkspaceVector &other) = delete; ++ ++ /// Copy assignment: copy contents of vector, not metadata. ++ WorkspaceVector& operator=(WorkspaceVector &other) ++ { ++ Vector::operator=(other); ++ return *this; ++ } ++ ++ /// Cannot move to an existing WorkspaceVector. ++ WorkspaceVector& operator=(WorkspaceVector &&other) = delete; ++ ++ // All other operator=, inherit from Vector ++ using Vector::operator=; ++ ++ /// Destructor. Notifies the WorkspaceChunk that this vector has been freed. ++ ~WorkspaceVector(); ++}; ++ ++namespace internal ++{ ++ ++/// @brief A chunk of storage used to allocate WorkspaceVector%s. ++/// ++/// This is an internal class used by Workspace. ++class WorkspaceChunk ++{ ++ /// The data used as a base for the WorkspaceVector%s. ++ Vector data; ++ ++ /// The offset of currently allocated WorkspaceVector%s in this chunk. ++ int offset = 0; ++ ++ /// How many vectors have been allocated in this chunk. ++ int vector_count = 0; ++ ++ /// Is the chunk in the front of the list? ++ bool front = true; ++ ++ /// The original capacity allocated. ++ const int original_capacity; ++ ++public: ++ /// Create a WorkspaceChunk with the given @a capacity. ++ WorkspaceChunk(int capacity); ++ ++ /// @brief Return the available capacity (i.e. the largest vector that will ++ /// fit in this chunk). ++ int GetAvailableCapacity() const { return data.Size() - offset; } ++ ++ /// @brief Returns the original capacity of the chunk. ++ /// ++ /// If the chunk is not in the front of the list and all of its vectors are ++ /// freed, it may deallocate its data, so the capacity becomes zero. The ++ /// "original capacity" remains unchanged. ++ int GetOriginalCapacity() const { return original_capacity; } ++ ++ /// Return the data offset. 
++ int GetOffset() const { return offset; } ++ ++ /// Sets whether the chunk is in the front of the list ++ void SetFront(bool front_) { front = front_; } ++ ++ /// Returns true if this chunk can fit a new vector of size @a n. ++ bool HasCapacityFor(int n) const { return n <= GetAvailableCapacity(); } ++ ++ /// Returns true if this chunk is empty. ++ bool IsEmpty() const { return vector_count == 0; } ++ ++ /// Note that a vector from this chunk has been deallocated. ++ void FreeVector(const WorkspaceVector &v); ++ ++ /// Returns the backing data Vector. ++ Vector &GetData() { return data; } ++ ++ /// Returns a new WorkspaceVector of size @a n. ++ WorkspaceVector NewVector(int n); ++}; ++ ++} ++ ++/// @brief Storage for temporary, short-lived workspace vectors. ++/// ++/// This class implements a simple bump allocator to quickly allocate and ++/// deallocate workspace vectors without incurring the overhead of memory ++/// allocation and deallocation. ++class Workspace ++{ ++ /// Chunks of storage to hold the vectors. ++ std::forward_list chunks; ++ ++ /// Default constructor, private (singleton class). ++ Workspace() = default; ++ ++ /// @brief Consolidate the chunks (merge consecutive empty chunks), and ++ /// ensure that the front chunk has sufficient available capacity for a ++ /// vector of @a requested_size. ++ void ConsolidateAndEnsureAvailable(int requested_size); ++ ++ /// Return the singleton instance. ++ static Workspace &Instance(); ++ ++public: ++ /// Return a new WorkspaceVector of the requested size. ++ static WorkspaceVector NewVector(int n); ++ ++ /// Ensure that capacity of at least @a n is available for allocations. ++ static void Reserve(int n); ++ ++ /// Clear all storage. Invalidates any existing WorkspaceVector%s. 
++ static void Clear(); ++}; ++ ++} // namespace mfem ++ ++#endif +diff --git a/linalg/blockoperator.cpp b/linalg/blockoperator.cpp +index f5e1d02c7..c6b48d8b9 100644 +--- a/linalg/blockoperator.cpp ++++ b/linalg/blockoperator.cpp +@@ -11,6 +11,7 @@ + + + #include "../general/array.hpp" ++#include "../general/workspace.hpp" + #include "operator.hpp" + #include "blockvector.hpp" + #include "blockoperator.hpp" +@@ -78,7 +79,7 @@ void BlockOperator::Mult (const Vector & x, Vector & y) const + + for (int iRow=0; iRow < nRowBlocks; ++iRow) + { +- tmp.SetSize(row_offsets[iRow+1] - row_offsets[iRow]); ++ auto tmp = Workspace::NewVector(row_offsets[iRow+1] - row_offsets[iRow]); + for (int jCol=0; jCol < nColBlocks; ++jCol) + { + if (op(iRow,jCol)) +@@ -109,7 +110,7 @@ void BlockOperator::MultTranspose (const Vector & x, Vector & y) const + + for (int iRow=0; iRow < nColBlocks; ++iRow) + { +- tmp.SetSize(col_offsets[iRow+1] - col_offsets[iRow]); ++ auto tmp = Workspace::NewVector(col_offsets[iRow+1] - col_offsets[iRow]); + for (int jCol=0; jCol < nRowBlocks; ++jCol) + { + if (op(jCol,iRow)) +@@ -284,8 +285,8 @@ void BlockLowerTriangularPreconditioner::Mult (const Vector & x, + y = 0.0; + for (int iRow=0; iRow < nBlocks; ++iRow) + { +- tmp.SetSize(offsets[iRow+1] - offsets[iRow]); +- tmp2.SetSize(offsets[iRow+1] - offsets[iRow]); ++ auto tmp = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); ++ auto tmp2 = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); + tmp2 = 0.0; + tmp2 += xblock.GetBlock(iRow); + for (int jCol=0; jCol < iRow; ++jCol) +@@ -320,8 +321,8 @@ void BlockLowerTriangularPreconditioner::MultTranspose (const Vector & x, + y = 0.0; + for (int iRow=nBlocks-1; iRow >=0; --iRow) + { +- tmp.SetSize(offsets[iRow+1] - offsets[iRow]); +- tmp2.SetSize(offsets[iRow+1] - offsets[iRow]); ++ auto tmp = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); ++ auto tmp2 = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); + tmp2 = 0.0; + tmp2 += 
xblock.GetBlock(iRow); + for (int jCol=iRow+1; jCol < nBlocks; ++jCol) +diff --git a/linalg/blockoperator.hpp b/linalg/blockoperator.hpp +index a6c05c477..8900a380b 100644 +--- a/linalg/blockoperator.hpp ++++ b/linalg/blockoperator.hpp +@@ -127,7 +127,6 @@ private: + //! Temporary Vectors used to efficiently apply the Mult and MultTranspose methods. + mutable BlockVector xblock; + mutable BlockVector yblock; +- mutable Vector tmp; + }; + + //! @class BlockDiagonalPreconditioner +@@ -281,8 +280,6 @@ private: + //! methods. + mutable BlockVector xblock; + mutable BlockVector yblock; +- mutable Vector tmp; +- mutable Vector tmp2; + }; + + } +diff --git a/linalg/operator.cpp b/linalg/operator.cpp +index 3fd4d64f9..9b7f8ed54 100644 +--- a/linalg/operator.cpp ++++ b/linalg/operator.cpp +@@ -12,6 +12,7 @@ + #include "vector.hpp" + #include "operator.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + #include + #include +@@ -50,7 +51,7 @@ void Operator::InitTVectors(const Operator *Po, const Operator *Ri, + + void Operator::AddMult(const Vector &x, Vector &y, const double a) const + { +- mfem::Vector z(y.Size()); ++ auto z = Workspace::NewVector(y.Size()); + Mult(x, z); + y.Add(a, z); + } +@@ -58,7 +59,7 @@ void Operator::AddMult(const Vector &x, Vector &y, const double a) const + void Operator::AddMultTranspose(const Vector &x, Vector &y, + const double a) const + { +- mfem::Vector z(y.Size()); ++ auto z = Workspace::NewVector(y.Size()); + MultTranspose(x, z); + y.Add(a, z); + } +@@ -516,13 +517,8 @@ ConstrainedOperator::ConstrainedOperator(Operator *A, const Array &list, + { + // 'mem_class' should work with A->Mult() and mfem::forall(): + mem_class = A->GetMemoryClass()*Device::GetDeviceMemoryClass(); +- MemoryType mem_type = GetMemoryType(mem_class); + list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + constraint_list.MakeRef(list); +- // typically z and w are large vectors, so use the device (GPU) to perform +- 
// operations on them +- z.SetSize(height, mem_type); z.UseDevice(true); +- w.SetSize(height, mem_type); w.UseDevice(true); + } + + void ConstrainedOperator::AssembleDiagonal(Vector &diag) const +@@ -558,6 +554,7 @@ void ConstrainedOperator::AssembleDiagonal(Vector &diag) const + + void ConstrainedOperator::EliminateRHS(const Vector &x, Vector &b) const + { ++ auto w = Workspace::NewVector(height); + w = 0.0; + const int csz = constraint_list.Size(); + auto idx = constraint_list.Read(); +@@ -570,9 +567,7 @@ void ConstrainedOperator::EliminateRHS(const Vector &x, Vector &b) const + d_w[id] = d_x[id]; + }); + +- // A.AddMult(w, b, -1.0); // if available to all Operators +- A->Mult(w, z); +- b -= z; ++ A->AddMult(w, b, -1.0); + + // Use read+write access - we are modifying sub-vector of b + auto d_b = b.ReadWrite(); +@@ -600,6 +595,7 @@ void ConstrainedOperator::ConstrainedMult(const Vector &x, Vector &y, + return; + } + ++ auto z = Workspace::NewVector(height); + z = x; + + auto idx = constraint_list.Read(); +@@ -657,13 +653,6 @@ void ConstrainedOperator::MultTranspose(const Vector &x, Vector &y) const + ConstrainedMult(x, y, transpose); + } + +-void ConstrainedOperator::AddMult(const Vector &x, Vector &y, +- const double a) const +-{ +- Mult(x, w); +- y.Add(a, w); +-} +- + RectangularConstrainedOperator::RectangularConstrainedOperator( + Operator *A, + const Array &trial_list, +@@ -673,19 +662,16 @@ RectangularConstrainedOperator::RectangularConstrainedOperator( + { + // 'mem_class' should work with A->Mult() and mfem::forall(): + mem_class = A->GetMemoryClass()*Device::GetMemoryClass(); +- MemoryType mem_type = GetMemoryType(mem_class); + trial_list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + test_list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + trial_constraints.MakeRef(trial_list); + test_constraints.MakeRef(test_list); +- // typically z and w are large vectors, so store them on the device +- 
z.SetSize(height, mem_type); z.UseDevice(true); +- w.SetSize(width, mem_type); w.UseDevice(true); + } + + void RectangularConstrainedOperator::EliminateRHS(const Vector &x, + Vector &b) const + { ++ auto w = Workspace::NewVector(width); + w = 0.0; + const int trial_csz = trial_constraints.Size(); + auto trial_idx = trial_constraints.Read(); +@@ -719,6 +705,7 @@ void RectangularConstrainedOperator::Mult(const Vector &x, Vector &y) const + } + else + { ++ auto w = Workspace::NewVector(width); + w = x; + + auto idx = trial_constraints.Read(); +@@ -754,6 +741,7 @@ void RectangularConstrainedOperator::MultTranspose(const Vector &x, + } + else + { ++ auto z = Workspace::NewVector(height); + z = x; + + auto idx = test_constraints.Read(); +diff --git a/linalg/operator.hpp b/linalg/operator.hpp +index fa856ffb8..b8b1761ae 100644 +--- a/linalg/operator.hpp ++++ b/linalg/operator.hpp +@@ -897,7 +897,6 @@ protected: + Array constraint_list; ///< List of constrained indices/dofs. + Operator *A; ///< The unconstrained Operator. + bool own_A; ///< Ownership flag for A. +- mutable Vector z, w; ///< Auxiliary vectors. + MemoryClass mem_class; + DiagonalPolicy diag_policy; ///< Diagonal policy for constrained dofs + +@@ -946,8 +945,6 @@ public: + the vectors, and "_i" -- the rest of the entries. */ + void Mult(const Vector &x, Vector &y) const override; + +- void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; +- + void MultTranspose(const Vector &x, Vector &y) const override; + + /** @brief Implementation of Mult or MultTranspose. 
+@@ -972,7 +969,6 @@ protected: + Array trial_constraints, test_constraints; + Operator *A; + bool own_A; +- mutable Vector z, w; + MemoryClass mem_class; + + public: +diff --git a/linalg/solvers.cpp b/linalg/solvers.cpp +index ba0af7e91..053607e88 100644 +--- a/linalg/solvers.cpp ++++ b/linalg/solvers.cpp +@@ -13,6 +13,7 @@ + #include "../general/annotation.hpp" + #include "../general/forall.hpp" + #include "../general/globals.hpp" ++#include "../general/workspace.hpp" + #include "../fem/bilinearform.hpp" + #include + #include +@@ -212,10 +213,9 @@ OperatorJacobiSmoother::OperatorJacobiSmoother(const BilinearForm &a, + dinv(height), + damping(dmpng), + ess_tdof_list(&ess_tdofs), +- residual(height), + allow_updates(false) + { +- Vector &diag(residual); ++ auto diag = Workspace::NewVector(height); + a.AssembleDiagonal(diag); + // 'a' cannot be used for iterative_mode == true because its size may be + // different. +@@ -231,7 +231,6 @@ OperatorJacobiSmoother::OperatorJacobiSmoother(const Vector &d, + dinv(height), + damping(dmpng), + ess_tdof_list(&ess_tdofs), +- residual(height), + oper(NULL), + allow_updates(false) + { +@@ -268,15 +267,13 @@ void OperatorJacobiSmoother::SetOperator(const Operator &op) + ess_tdof_list = nullptr; + } + dinv.SetSize(height); +- residual.SetSize(height); +- Vector &diag(residual); ++ auto diag = Workspace::NewVector(height); + op.AssembleDiagonal(diag); + Setup(diag); + } + + void OperatorJacobiSmoother::Setup(const Vector &diag) + { +- residual.UseDevice(true); + const double delta = damping; + auto D = diag.Read(); + auto DI = dinv.Write(); +@@ -307,6 +304,8 @@ void OperatorJacobiSmoother::Mult(const Vector &x, Vector &y) const + MFEM_ASSERT(x.Size() == Width(), "invalid input vector"); + MFEM_ASSERT(y.Size() == Height(), "invalid output vector"); + ++ auto residual = Workspace::NewVector(height); ++ + if (iterative_mode) + { + MFEM_VERIFY(oper, "iterative_mode == true requires the forward operator"); +@@ -341,7 +340,6 @@ 
OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator &oper_, + diag(d), + coeffs(order), + ess_tdof_list(ess_tdofs), +- residual(N), + oper(&oper_) { Setup(); } + + #ifdef MFEM_USE_MPI +@@ -362,7 +360,6 @@ OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator &oper_, + diag(d), + coeffs(order), + ess_tdof_list(ess_tdofs), +- residual(N), + oper(&oper_) + { + OperatorJacobiSmoother invDiagOperator(diag, ess_tdofs, 1.0); +@@ -405,7 +402,6 @@ OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator* oper_, + void OperatorChebyshevSmoother::Setup() + { + // Invert diagonal +- residual.UseDevice(true); + auto D = diag.Read(); + auto X = dinv.Write(); + mfem::forall(N, [=] MFEM_HOST_DEVICE (int i) { X[i] = 1.0 / D[i]; }); +@@ -494,8 +490,10 @@ void OperatorChebyshevSmoother::Mult(const Vector& x, Vector &y) const + MFEM_ABORT("Chebyshev smoother requires operator"); + } + ++ auto residual = Workspace::NewVector(x.Size()); ++ auto helperVector = Workspace::NewVector(x.Size()); ++ + residual = x; +- helperVector.SetSize(x.Size()); + + y.UseDevice(true); + y = 0.0; +@@ -2997,7 +2995,7 @@ void BlockILU::Mult(const Vector &b, Vector &x) const + { + MFEM_ASSERT(height > 0, "BlockILU(0) preconditioner is not constructed"); + int nblockrows = Height()/block_size; +- y.SetSize(Height()); ++ auto y = Workspace::NewVector(Height()); + + DenseMatrix B; + Vector yi, yj, xi, xj; +@@ -3457,6 +3455,8 @@ void OrthoSolver::Mult(const Vector &b, Vector &x) const + MFEM_VERIFY(height == b.Size(), "incompatible input Vector size!"); + MFEM_VERIFY(height == x.Size(), "incompatible output Vector size!"); + ++ auto b_ortho = Workspace::NewVector(height); ++ + // Orthogonalize input + Orthogonalize(b, b_ortho); + +diff --git a/linalg/solvers.hpp b/linalg/solvers.hpp +index 5b222fc50..ff243746c 100644 +--- a/linalg/solvers.hpp ++++ b/linalg/solvers.hpp +@@ -368,7 +368,6 @@ private: + Vector dinv; + const double damping; + const Array *ess_tdof_list; // not 
owned; may be NULL +- mutable Vector residual; + /// Uses absolute values of the diagonal entries. + bool use_abs_diag = false; + +@@ -465,8 +464,6 @@ private: + const Vector &diag; + Array coeffs; + const Array& ess_tdof_list; +- mutable Vector residual; +- mutable Vector helperVector; + const Operator* oper; + }; + +@@ -1046,9 +1043,6 @@ private: + + Reordering reordering; + +- /// Temporary vector used in the Mult() function. +- mutable Vector y; +- + /// Permutation and inverse permutation vectors for the block reordering. + Array P, Pinv; + +@@ -1249,8 +1243,6 @@ public: + private: + Solver *solver = nullptr; + +- mutable Vector b_ortho; +- + void Orthogonalize(const Vector &v, Vector &v_ortho) const; + }; +