diff --git a/cmake/ExternalGitTags.cmake b/cmake/ExternalGitTags.cmake index 1d4dbb245..81c2fca94 100644 --- a/cmake/ExternalGitTags.cmake +++ b/cmake/ExternalGitTags.cmake @@ -20,7 +20,7 @@ set(EXTERN_ARPACK_GIT_BRANCH "Git branch for external ARPACK-NG build" ) set(EXTERN_ARPACK_GIT_TAG - "0eaf38bd51efd3046f0a7e61a08ddcd7ed1f1fcc" CACHE STRING # 11/22/2023 + "51ce061ff7066ce712638bb058c805c40a326582" CACHE STRING "Git tag for external ARPACK-NG build" ) @@ -34,7 +34,7 @@ set(EXTERN_BUTTERFLYPACK_GIT_BRANCH "Git branch for external ButterflyPACK build" ) set(EXTERN_BUTTERFLYPACK_GIT_TAG - "e1ad6091e8dc2cb906ec426222f4acedb4eeeff2" CACHE STRING # 01/10/2024 + "e1ad6091e8dc2cb906ec426222f4acedb4eeeff2" CACHE STRING "Git tag for external ButterflyPACK build" ) @@ -48,7 +48,7 @@ set(EXTERN_GSLIB_GIT_BRANCH "Git branch for external GSLIB build" ) set(EXTERN_GSLIB_GIT_TAG - "39d1baae8f4bfebe3ebca6a234dcc8ba1ee5edc7" CACHE STRING # 11/09/2022 + "39d1baae8f4bfebe3ebca6a234dcc8ba1ee5edc7" CACHE STRING "Git tag for external GSLIB build" ) @@ -62,7 +62,7 @@ set(EXTERN_HYPRE_GIT_BRANCH "Git branch for external HYPRE build" ) set(EXTERN_HYPRE_GIT_TAG - "57bfb26e268ddf003668c5d0b5938ae258922a83" CACHE STRING # 01/08/2024 + "7e7fc8ce09153c60ae538a52a5f870f93b9608ca" CACHE STRING "Git tag for external HYPRE build" ) @@ -76,7 +76,7 @@ set(EXTERN_LIBCEED_GIT_BRANCH "Git branch for external libCEED build" ) set(EXTERN_LIBCEED_GIT_TAG - "699fe8f87dbb33a262c5d9f777cbfdff72550eee" CACHE STRING # main @ 01/11/2024 + "437ae4f6500350dd714f2f799cd1eb0b6575cf0f" CACHE STRING # main @ 02/14/2024 "Git tag for external libCEED build" ) @@ -90,7 +90,7 @@ set(EXTERN_LIBXSMM_GIT_BRANCH "Git branch for external LIBXSMM build" ) set(EXTERN_LIBXSMM_GIT_TAG - "462cbc42f7c5423642a340a71f825731a013ecd6" CACHE STRING # 01/11/2023 + "13df674c4b73a1b84f6456de8595903ebfbb43e0" CACHE STRING "Git tag for external LIBXSMM build" ) @@ -104,7 +104,7 @@ set(EXTERN_MAGMA_GIT_BRANCH "Git branch for external MAGMA 
build" ) set(EXTERN_MAGMA_GIT_TAG - "061a800467aab6d15f4353c4fc0770375863b43e" CACHE STRING # 12/29/2023 + "1628609ab4a8421bd50bafb33cabf49001c1c97b" CACHE STRING "Git tag for external MAGMA build" ) @@ -118,7 +118,7 @@ set(EXTERN_METIS_GIT_BRANCH "Git branch for external METIS build" ) set(EXTERN_METIS_GIT_TAG - "8b194fdf09661ac41b36fa16db0474d38f46f1ac" CACHE STRING # 01/10/2023 + "8b194fdf09661ac41b36fa16db0474d38f46f1ac" CACHE STRING "Git tag for external METIS build" ) @@ -132,7 +132,7 @@ set(EXTERN_MFEM_GIT_BRANCH "Git branch for external MFEM build" ) set(EXTERN_MFEM_GIT_TAG - "2b27d4815f6e549ffb01065e9ad2d3549706ccec" CACHE STRING # master @ 01/11/2024 + "db77249ac0f22e277081d43a5db6c3601e9e2830" CACHE STRING # master @ 02/13/2024 "Git tag for external MFEM build" ) @@ -146,7 +146,7 @@ set(EXTERN_MUMPS_GIT_BRANCH "Git branch for external MUMPS build" ) set(EXTERN_MUMPS_GIT_TAG - "89d12a54114636a040baf652a3b654530ef2590f" CACHE STRING # 01/11/2023 + "888feec5482665dc7f334e9d08cc45e98128340c" CACHE STRING "Git tag for external MUMPS build" ) @@ -160,7 +160,7 @@ set(EXTERN_PARMETIS_GIT_BRANCH "Git branch for external ParMETIS build" ) set(EXTERN_PARMETIS_GIT_TAG - "f5e3aab04fd5fe6e09fa02f885c1c29d349f9f8b" CACHE STRING # 01/11/2023 + "f5e3aab04fd5fe6e09fa02f885c1c29d349f9f8b" CACHE STRING "Git tag for external ParMETIS build" ) @@ -174,7 +174,7 @@ set(EXTERN_PETSC_GIT_BRANCH "Git branch for external PETSc build" ) set(EXTERN_PETSC_GIT_TAG - "e98d5aa1d9a8b761c58f921ec2d97e44eaf55cc3" CACHE STRING # 01/12/2024 + "d1841210cfd64925ca30ed94c24c0bbb5cdc9142" CACHE STRING "Git tag for external PETSc build" ) @@ -188,7 +188,7 @@ set(EXTERN_SCALAPACK_GIT_BRANCH "Git branch for external ScaLAPACK build" ) set(EXTERN_SCALAPACK_GIT_TAG - "8435bf3bc435d9611d096abab46ebf6beb1d35ea" CACHE STRING # 11/15/2023 + "8435bf3bc435d9611d096abab46ebf6beb1d35ea" CACHE STRING "Git tag for external ScaLAPACK build" ) @@ -202,7 +202,7 @@ set(EXTERN_SLEPC_GIT_BRANCH "Git branch for 
external SLEPc build" ) set(EXTERN_SLEPC_GIT_TAG - "9fccf47e48b0f46fc169e86349cb3ddd4f8dee5b" CACHE STRING # 01/12/2024 + "7980363cc28f3ee28098ac846022f6cc6d867aa6" CACHE STRING "Git tag for external SLEPc build" ) @@ -216,7 +216,7 @@ set(EXTERN_STRUMPACK_GIT_BRANCH "Git branch for external STRUMPACK build" ) set(EXTERN_STRUMPACK_GIT_TAG - "90dcaaa3c7f06e69f94da43aa1043dd9e030cc2a" CACHE STRING # 12/14/2023 + "3462e8cbc6e57d0c3934492abbcc69ee7edb9457" CACHE STRING "Git tag for external STRUMPACK build" ) @@ -230,7 +230,7 @@ set(EXTERN_SUPERLU_GIT_BRANCH "Git branch for external SuperLU_DIST build" ) set(EXTERN_SUPERLU_GIT_TAG - "b3eecd3eaac3a1332d0d2c5fc052d1af114df192" CACHE STRING # 11/17/2023 + "67575a491de06e81a98c804929c8f0de40ee5990" CACHE STRING "Git tag for external SuperLU_DIST build" ) @@ -244,7 +244,7 @@ set(EXTERN_ZFP_GIT_BRANCH "Git branch for external ZFP build" ) set(EXTERN_ZFP_GIT_TAG - "f61aac5ca3445ac48bab94d0e3da62bb77bdd989" CACHE STRING # 01/11/2024 + "6d2b93cf0fda729903fa4b95d4b8580f8f86950f" CACHE STRING "Git tag for external ZFP build" ) @@ -256,7 +256,7 @@ set(EXTERN_JSON_URL # fmt set(EXTERN_FMT_URL - "https://github.com/fmtlib/fmt/releases/download/10.1.1/fmt-10.1.1.zip" CACHE STRING + "https://github.com/fmtlib/fmt/releases/download/10.2.0/fmt-10.2.0.zip" CACHE STRING "URL for external fmt build" ) diff --git a/cmake/ExternalLibCEED.cmake b/cmake/ExternalLibCEED.cmake index 008fa9754..203038d8e 100644 --- a/cmake/ExternalLibCEED.cmake +++ b/cmake/ExternalLibCEED.cmake @@ -121,6 +121,11 @@ endif() string(REPLACE ";" "; " LIBCEED_OPTIONS_PRINT "${LIBCEED_OPTIONS}") message(STATUS "LIBCEED_OPTIONS: ${LIBCEED_OPTIONS_PRINT}") +# Patch for LIBXSMM pre-v2 +set(LIBCEED_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/libCEED/patch_xsmm.diff" +) + include(ExternalProject) ExternalProject_Add(libCEED DEPENDS ${LIBCEED_DEPENDENCIES} @@ -131,6 +136,7 @@ ExternalProject_Add(libCEED PREFIX ${CMAKE_BINARY_DIR}/extern/libCEED-cmake BUILD_IN_SOURCE TRUE 
UPDATE_COMMAND "" + PATCH_COMMAND git apply "${LIBCEED_PATCH_FILES}" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBCEED_OPTIONS} install diff --git a/cmake/ExternalMFEM.cmake b/cmake/ExternalMFEM.cmake index 1e3927366..0d5c3e380 100644 --- a/cmake/ExternalMFEM.cmake +++ b/cmake/ExternalMFEM.cmake @@ -98,6 +98,7 @@ endif() # Configure BLAS/LAPACK for dependencies if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") list(APPEND MFEM_OPTIONS + # "-DMFEM_USE_LAPACK=YES" "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" ) @@ -361,8 +362,9 @@ set(MFEM_PATCH_FILES "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_vis_dev.diff" "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_partitioner_dev.diff" "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_par_tet_mesh_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_pncmesh_update_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_getnodalvalues_device_fix.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_workspace_vectors.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff" ) include(ExternalProject) diff --git a/extern/patch/libCEED/patch_xsmm.diff b/extern/patch/libCEED/patch_xsmm.diff new file mode 100644 index 000000000..aeb05410e --- /dev/null +++ b/extern/patch/libCEED/patch_xsmm.diff @@ -0,0 +1,98 @@ +diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c +index 44e9081e..09bdb6fd 100644 +--- a/backends/ref/ceed-ref-restriction.c ++++ b/backends/ref/ceed-ref-restriction.c +@@ -233,7 +233,7 @@ static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestr + CeedScalar vv_loc; + + vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset]; +- CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; ++ vv[impl->offsets[j + e * elem_size] + k * 
comp_stride] += vv_loc; + } + } + } +@@ -257,7 +257,7 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes + CeedScalar vv_loc; + + vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0); +- CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; ++ vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; + } + } + } +@@ -287,7 +287,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + for (n = 1; n < elem_size - 1; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -299,7 +299,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -309,7 +309,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + } +@@ -338,7 +338,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + 
abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + for (n = 1; n < elem_size - 1; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -350,7 +350,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) { +@@ -360,7 +360,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); + } + for (CeedInt j = 0; j < block_end; j++) { +- CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; ++ vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; + } + } + } +diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c +index a64f5e9f..555352ef 100644 +--- a/backends/xsmm/ceed-xsmm-tensor.c ++++ b/backends/xsmm/ceed-xsmm-tensor.c +@@ -30,7 +30,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) + : libxsmm_create_gemm_shape(J, A, B, !t_mode ? 
B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); +- const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); ++ const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); +@@ -50,7 +50,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) + : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); +- const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); ++ const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); diff --git a/extern/patch/mfem/patch_getnodalvalues_device_fix.diff b/extern/patch/mfem/patch_getnodalvalues_device_fix.diff deleted file mode 100644 index 1f41b8123..000000000 --- a/extern/patch/mfem/patch_getnodalvalues_device_fix.diff +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/fem/gridfunc.cpp b/fem/gridfunc.cpp -index 310d8d704..bd5504d63 100644 ---- a/fem/gridfunc.cpp -+++ b/fem/gridfunc.cpp -@@ -1924,9 +1924,9 @@ void GridFunction::GetNodalValues(Vector &nval, int vdim) const - Array values; - Array overlap(fes->GetNV()); - nval.SetSize(fes->GetNV()); -- - nval = 0.0; - overlap = 0; -+ nval.HostReadWrite(); - for (i = 0; i < fes->GetNE(); i++) - { - fes->GetElementVertices(i, vertices); diff --git 
a/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff b/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff new file mode 100644 index 000000000..9658c832c --- /dev/null +++ b/extern/patch/mfem/patch_hypre_runtime_compute_policy.diff @@ -0,0 +1,970 @@ +diff --git a/fem/complex_fem.cpp b/fem/complex_fem.cpp +index 95565804d..a26707cbe 100644 +--- a/fem/complex_fem.cpp ++++ b/fem/complex_fem.cpp +@@ -1243,25 +1243,28 @@ ParSesquilinearForm::FormLinearSystem(const Array &ess_tdof_list, + HypreParMatrix * Ah; + A_i.Get(Ah); + hypre_ParCSRMatrix *Aih = *Ah; +-#if !defined(HYPRE_USING_GPU) +- ess_tdof_list.HostRead(); +- for (int k = 0; k < n; k++) ++ if (!HypreUsingGPU()) + { +- const int j = ess_tdof_list[k]; +- Aih->diag->data[Aih->diag->i[j]] = 0.0; ++ ess_tdof_list.HostRead(); ++ for (int k = 0; k < n; k++) ++ { ++ const int j = ess_tdof_list[k]; ++ Aih->diag->data[Aih->diag->i[j]] = 0.0; ++ } + } +-#else +- Ah->HypreReadWrite(); +- const int *d_ess_tdof_list = +- ess_tdof_list.GetMemory().Read(MemoryClass::DEVICE, n); +- const int *d_diag_i = Aih->diag->i; +- double *d_diag_data = Aih->diag->data; +- MFEM_GPU_FORALL(k, n, ++ else + { +- const int j = d_ess_tdof_list[k]; +- d_diag_data[d_diag_i[j]] = 0.0; +- }); +-#endif ++ Ah->HypreReadWrite(); ++ const int *d_ess_tdof_list = ++ ess_tdof_list.GetMemory().Read(MemoryClass::DEVICE, n); ++ HYPRE_Int *d_diag_i = Aih->diag->i; ++ double *d_diag_data = Aih->diag->data; ++ mfem::forall(n, [=] MFEM_HOST_DEVICE (int k) ++ { ++ const int j = d_ess_tdof_list[k]; ++ d_diag_data[d_diag_i[j]] = 0.0; ++ }); ++ } + } + else + { +diff --git a/fem/lor/lor_ams.cpp b/fem/lor/lor_ams.cpp +index f8d0dfd8c..9066cc20a 100644 +--- a/fem/lor/lor_ams.cpp ++++ b/fem/lor/lor_ams.cpp +@@ -291,7 +291,7 @@ void BatchedLOR_AMS::FormCoordinateVectors(const Vector &X_vert) + const auto ltdof_ldof = HypreRead(R->GetMemoryJ()); + + // Go from E-vector format directly to T-vector format +- MFEM_HYPRE_FORALL(i, ntdofs, ++ 
mfem::forall_switch(HypreUsingGPU(), ntdofs, [=] MFEM_HOST_DEVICE (int i) + { + const int j = d_offsets[ltdof_ldof[i]]; + for (int c = 0; c < sdim; ++c) +diff --git a/general/mem_manager.hpp b/general/mem_manager.hpp +index c5fed95a0..753bd74b3 100644 +--- a/general/mem_manager.hpp ++++ b/general/mem_manager.hpp +@@ -995,7 +995,7 @@ inline void Memory::MakeAlias(const Memory &base, int offset, int size) + if (!(base.flags & Registered)) + { + if ( +-#if !defined(HYPRE_USING_GPU) ++#if !defined(HYPRE_USING_GPU) || MFEM_HYPRE_VERSION >= 22600 + // If the following condition is true then MemoryManager::Exists() + // should also be true: + IsDeviceMemory(MemoryManager::GetDeviceMemoryType()) +diff --git a/linalg/hypre.cpp b/linalg/hypre.cpp +index 0fd16be03..19cd473de 100644 +--- a/linalg/hypre.cpp ++++ b/linalg/hypre.cpp +@@ -65,6 +65,18 @@ void Hypre::SetDefaultOptions() + #endif + #endif + ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) ++ { ++ HYPRE_SetMemoryLocation(HYPRE_MEMORY_DEVICE); ++ HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); ++ } ++ else ++ { ++ HYPRE_SetMemoryLocation(HYPRE_MEMORY_HOST); ++ HYPRE_SetExecutionPolicy(HYPRE_EXEC_HOST); ++ } ++#else + // The following options are hypre's defaults as of hypre-2.24 + + // Allocate hypre objects in GPU memory (default) +@@ -72,6 +84,16 @@ void Hypre::SetDefaultOptions() + + // Where to execute when using UVM (default) + // HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); ++#endif ++ ++ // TODO 23/11/28: introduced in https://github.com/hypre-space/hypre/pull/962, ++ // not yet in a release ++#if MFEM_HYPRE_VERSION >= 23100 ++ if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) ++ { ++ HYPRE_DeviceInitialize(); ++ } ++#endif + + // Use GPU-based random number generator (default) + // HYPRE_SetUseGpuRand(1); +@@ -121,22 +143,27 @@ bool CanShallowCopy(const Memory &src, MemoryClass mc) + inline void HypreParVector::_SetDataAndSize_() + { + hypre_Vector *x_loc = 
hypre_ParVectorLocalVector(x); +-#if !defined(HYPRE_USING_GPU) +- SetDataAndSize(hypre_VectorData(x_loc), +- internal::to_int(hypre_VectorSize(x_loc))); +-#else +- size = internal::to_int(hypre_VectorSize(x_loc)); +- MemoryType mt = (hypre_VectorMemoryLocation(x_loc) == HYPRE_MEMORY_HOST +- ? MemoryType::HOST : GetHypreMemoryType()); +- if (hypre_VectorData(x_loc) != NULL) ++#if defined(HYPRE_USING_GPU) ++ if (HypreUsingGPU()) + { +- data.Wrap(hypre_VectorData(x_loc), size, mt, false); ++ size = internal::to_int(hypre_VectorSize(x_loc)); ++ MemoryType mt = (hypre_VectorMemoryLocation(x_loc) == HYPRE_MEMORY_HOST ++ ? MemoryType::HOST : GetHypreMemoryType()); ++ if (hypre_VectorData(x_loc) != NULL) ++ { ++ data.Wrap(hypre_VectorData(x_loc), size, mt, false); ++ } ++ else ++ { ++ data.Reset(); ++ } + } + else ++#endif + { +- data.Reset(); ++ SetDataAndSize(hypre_VectorData(x_loc), ++ internal::to_int(hypre_VectorSize(x_loc))); + } +-#endif + } + + HypreParVector::HypreParVector(MPI_Comm comm, HYPRE_BigInt glob_size, +@@ -315,7 +342,7 @@ void HypreParVector::HypreRead() const + hypre_VectorData(x_loc) = + const_cast(data.Read(GetHypreMemoryClass(), size)); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -324,7 +351,7 @@ void HypreParVector::HypreReadWrite() + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = data.ReadWrite(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -333,7 +360,7 @@ void HypreParVector::HypreWrite() + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = data.Write(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ 
hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + } + +@@ -347,7 +374,7 @@ void HypreParVector::WrapMemoryRead(const Memory &mem) + hypre_VectorData(x_loc) = + const_cast(mem.Read(GetHypreMemoryClass(), size)); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -361,7 +388,7 @@ void HypreParVector::WrapMemoryReadWrite(Memory &mem) + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = mem.ReadWrite(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -375,7 +402,7 @@ void HypreParVector::WrapMemoryWrite(Memory &mem) + hypre_Vector *x_loc = hypre_ParVectorLocalVector(x); + hypre_VectorData(x_loc) = mem.Write(GetHypreMemoryClass(), size); + #ifdef HYPRE_USING_GPU +- hypre_VectorMemoryLocation(x_loc) = HYPRE_MEMORY_DEVICE; ++ hypre_VectorMemoryLocation(x_loc) = mfem::GetHypreMemoryLocation(); + #endif + data.MakeAlias(mem, 0, size); + } +@@ -535,6 +562,12 @@ void HypreParMatrix::Init() + + void HypreParMatrix::Read(MemoryClass mc) const + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + const int num_rows = NumRows(); +@@ -547,15 +580,26 @@ void HypreParMatrix::Read(MemoryClass mc) const + offd->j = const_cast(mem_offd.J.Read(mc, offd_nnz)); + offd->data = const_cast(mem_offd.data.Read(mc, offd_nnz)); + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = 
++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + diag->memory_location = ml; + offd->memory_location = ml; +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + void HypreParMatrix::ReadWrite(MemoryClass mc) + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + const int num_rows = NumRows(); +@@ -568,15 +612,26 @@ void HypreParMatrix::ReadWrite(MemoryClass mc) + offd->j = mem_offd.J.ReadWrite(mc, offd_nnz); + offd->data = mem_offd.data.ReadWrite(mc, offd_nnz); + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = ++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? 
HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + diag->memory_location = ml; + offd->memory_location = ml; +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + void HypreParMatrix::Write(MemoryClass mc, bool set_diag, bool set_offd) + { ++#if MFEM_HYPRE_VERSION >= 22600 ++ if (GetHypreMemoryLocation() == HYPRE_MEMORY_HOST && mc != MemoryClass::HOST) ++ { ++ MFEM_ABORT("Hypre is configured to use the HOST but the MemoryClass is DEVICE"); ++ } ++#endif + hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(A); + if (set_diag) +@@ -592,11 +647,16 @@ void HypreParMatrix::Write(MemoryClass mc, bool set_diag, bool set_offd) + offd->data = mem_offd.data.Write(mc, mem_offd.data.Capacity()); + } + #if MFEM_HYPRE_VERSION >= 21800 ++#if MFEM_HYPRE_VERSION >= 22600 ++ decltype(diag->memory_location) ml = ++ (mc == MemoryClass::HOST) ? HYPRE_MEMORY_HOST : GetHypreMemoryLocation(); ++#else // MFEM_HYPRE_VERSION >= 22600 + decltype(diag->memory_location) ml = + (mc != GetHypreMemoryClass() ? 
HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE); ++#endif // MFEM_HYPRE_VERSION >= 22600 + if (set_diag) { diag->memory_location = ml; } + if (set_offd) { offd->memory_location = ml; } +-#endif ++#endif // MFEM_HYPRE_VERSION >= 21800 + } + + HypreParMatrix::HypreParMatrix() +@@ -759,7 +819,7 @@ signed char HypreParMatrix::HypreCsrToMem(hypre_CSRMatrix *h_mat, + h_mat->data = mem.data.ReadWrite(hypre_mc, nnz); + h_mat->owns_data = 0; + #if MFEM_HYPRE_VERSION >= 21800 +- h_mat->memory_location = HYPRE_MEMORY_DEVICE; ++ h_mat->memory_location = mfem::GetHypreMemoryLocation(); + #endif + return 3; + } +@@ -1360,8 +1420,11 @@ hypre_ParCSRMatrix* HypreParMatrix::StealData() + MFEM_ASSERT(ParCSROwner, ""); + hypre_ParCSRMatrix *R = A; + #ifdef HYPRE_USING_GPU +- if (diagOwner == -1) { HostReadWrite(); } +- else { HypreReadWrite(); } ++ if (HypreUsingGPU()) ++ { ++ if (diagOwner == -1) { HostReadWrite(); } ++ else { HypreReadWrite(); } ++ } + #endif + ParCSROwner = false; + Destroy(); +@@ -1709,7 +1772,10 @@ void HypreParMatrix::EnsureMultTranspose() const + #if (MFEM_HYPRE_VERSION == 22500 && HYPRE_DEVELOP_NUMBER >= 1) || \ + (MFEM_HYPRE_VERSION > 22500) + #ifdef HYPRE_USING_GPU +- hypre_ParCSRMatrixLocalTranspose(A); ++ if (HypreUsingGPU()) ++ { ++ hypre_ParCSRMatrixLocalTranspose(A); ++ } + #endif + #endif + } +@@ -1719,15 +1785,18 @@ void HypreParMatrix::ResetTranspose() const + #if (MFEM_HYPRE_VERSION == 22500 && HYPRE_DEVELOP_NUMBER >= 1) || \ + (MFEM_HYPRE_VERSION > 22500) + #ifdef HYPRE_USING_GPU +- if (A->diagT) ++ if (HypreUsingGPU()) + { +- hypre_CSRMatrixDestroy(A->diagT); +- A->diagT = NULL; +- } +- if (A->offdT) +- { +- hypre_CSRMatrixDestroy(A->offdT); +- A->offdT = NULL; ++ if (A->diagT) ++ { ++ hypre_CSRMatrixDestroy(A->diagT); ++ A->diagT = NULL; ++ } ++ if (A->offdT) ++ { ++ hypre_CSRMatrixDestroy(A->offdT); ++ A->offdT = NULL; ++ } + } + #endif + #endif +@@ -2416,8 +2485,8 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + } + + // Which of the local 
rows are to be eliminated? +- MFEM_HYPRE_FORALL(i, diag_nrows, eliminate_row[i] = 0; ); +- MFEM_HYPRE_FORALL(i, n_ess_dofs, eliminate_row[ess_dofs_d[i]] = 1; ); ++ mfem::forall_switch(HypreUsingGPU(), diag_nrows, [=] MFEM_HOST_DEVICE (int i) { eliminate_row[i] = 0; }); ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) { eliminate_row[ess_dofs_d[i]] = 1; }); + + // Use a matvec communication pattern to find (in eliminate_col) which of + // the local offd columns are to be eliminated +@@ -2428,26 +2497,36 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + + HYPRE_Int *send_map_elmts; + #if defined(HYPRE_USING_GPU) +- hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg); +- send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg); +-#else +- send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); ++ if (HypreUsingGPU()) ++ { ++ hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg); ++ send_map_elmts = hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg); ++ } ++ else + #endif +- MFEM_HYPRE_FORALL(i, int_buf_sz, ++ { ++ send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); ++ } ++ mfem::forall_switch(HypreUsingGPU(), int_buf_sz, [=] MFEM_HOST_DEVICE (int i) + { + int k = send_map_elmts[i]; + int_buf_data[i] = eliminate_row[k]; + }); + + #if defined(HYPRE_USING_GPU) +- // Try to use device-aware MPI for the communication if available +- comm_handle = hypre_ParCSRCommHandleCreate_v2( +- 11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, +- HYPRE_MEMORY_DEVICE, eliminate_col); +-#else +- comm_handle = hypre_ParCSRCommHandleCreate( +- 11, comm_pkg, int_buf_data, eliminate_col ); ++ if (HypreUsingGPU()) ++ { ++ // Try to use device-aware MPI for the communication if available ++ comm_handle = hypre_ParCSRCommHandleCreate_v2( ++ 11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, ++ HYPRE_MEMORY_DEVICE, eliminate_col); ++ } ++ else + #endif ++ { ++ comm_handle = hypre_ParCSRCommHandleCreate( ++ 11, comm_pkg, int_buf_data, 
eliminate_col ); ++ } + } + + // Eliminate rows and columns in the diagonal block +@@ -2456,7 +2535,7 @@ void HypreParMatrix::EliminateBC(const Array &ess_dofs, + const auto J = diag->j; + auto data = diag->data; + +- MFEM_HYPRE_FORALL(i, n_ess_dofs, ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) + { + const int idof = ess_dofs_d[i]; + for (int j=I[idof]; j &ess_dofs, + { + const auto I = offd->i; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, n_ess_dofs, ++ mfem::forall_switch(HypreUsingGPU(), n_ess_dofs, [=] MFEM_HOST_DEVICE (int i) + { + const int idof = ess_dofs_d[i]; + for (int j=I[idof]; j &ess_dofs, + const auto I = offd->i; + const auto J = offd->j; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, nrows_offd, ++ mfem::forall_switch(HypreUsingGPU(), nrows_offd, [=] MFEM_HOST_DEVICE (int i) + { + for (int j=I[i]; j{diag,offd}, so + // that they can be destroyed by hypre when hypre_ParCSRMatrixDestroy(A) +@@ -2855,10 +2934,15 @@ HypreParMatrix * ParMult(const HypreParMatrix *A, const HypreParMatrix *B, + { + hypre_ParCSRMatrix * ab; + #ifdef HYPRE_USING_GPU +- ab = hypre_ParCSRMatMat(*A, *B); +-#else +- ab = hypre_ParMatmul(*A,*B); ++ if (HypreUsingGPU()) ++ { ++ ab = hypre_ParCSRMatMat(*A, *B); ++ } ++ else + #endif ++ { ++ ab = hypre_ParMatmul(*A,*B); ++ } + hypre_ParCSRMatrixSetNumNonzeros(ab); + + if (!hypre_ParCSRMatrixCommPkg(ab)) { hypre_MatvecCommPkgCreate(ab); } +@@ -2882,6 +2966,7 @@ HypreParMatrix * RAP(const HypreParMatrix *A, const HypreParMatrix *P) + // in ex28p. + // Quick fix: add a diagonal matrix with 0 diagonal. + // Maybe use hypre_CSRMatrixCheckDiagFirst to see if we need the fix. 
++ if (HypreUsingGPU()) + { + hypre_ParCSRMatrix *Q = hypre_ParCSRMatMat(*A,*P); + const bool keepTranspose = false; +@@ -2891,25 +2976,27 @@ HypreParMatrix * RAP(const HypreParMatrix *A, const HypreParMatrix *P) + // alternative: + // hypre_ParCSRMatrixRAPKT + } +-#else ++ else ++#endif ++ { + #if MFEM_HYPRE_VERSION <= 22200 +- HYPRE_Int P_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); ++ HYPRE_Int P_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); + #endif + +- hypre_BoomerAMGBuildCoarseOperator(*P,*A,*P,&rap); ++ hypre_BoomerAMGBuildCoarseOperator(*P,*A,*P,&rap); + + #if MFEM_HYPRE_VERSION <= 22200 +- /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts +- from P (even if it does not own them)! */ +- hypre_ParCSRMatrixSetRowStartsOwner(rap,0); +- hypre_ParCSRMatrixSetColStartsOwner(rap,0); +- if (P_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*P, 1); +- } +-#endif ++ /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts ++ from P (even if it does not own them)! 
*/ ++ hypre_ParCSRMatrixSetRowStartsOwner(rap,0); ++ hypre_ParCSRMatrixSetColStartsOwner(rap,0); ++ if (P_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*P, 1); ++ } + #endif ++ } + + hypre_ParCSRMatrixSetNumNonzeros(rap); + // hypre_MatvecCommPkgCreate(rap); +@@ -2923,36 +3010,39 @@ HypreParMatrix * RAP(const HypreParMatrix * Rt, const HypreParMatrix *A, + hypre_ParCSRMatrix * rap; + + #ifdef HYPRE_USING_GPU ++ if (HypreUsingGPU()) + { + hypre_ParCSRMatrix *Q = hypre_ParCSRMatMat(*A,*P); + rap = hypre_ParCSRTMatMat(*Rt,Q); + hypre_ParCSRMatrixDestroy(Q); + } +-#else ++ else ++#endif ++ { + #if MFEM_HYPRE_VERSION <= 22200 +- HYPRE_Int P_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); +- HYPRE_Int Rt_owns_its_col_starts = +- hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*Rt)); ++ HYPRE_Int P_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*P)); ++ HYPRE_Int Rt_owns_its_col_starts = ++ hypre_ParCSRMatrixOwnsColStarts((hypre_ParCSRMatrix*)(*Rt)); + #endif + +- hypre_BoomerAMGBuildCoarseOperator(*Rt,*A,*P,&rap); ++ hypre_BoomerAMGBuildCoarseOperator(*Rt,*A,*P,&rap); + + #if MFEM_HYPRE_VERSION <= 22200 +- /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts +- from Rt and P (even if they do not own them)! */ +- hypre_ParCSRMatrixSetRowStartsOwner(rap,0); +- hypre_ParCSRMatrixSetColStartsOwner(rap,0); +- if (P_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*P, 1); +- } +- if (Rt_owns_its_col_starts) +- { +- hypre_ParCSRMatrixSetColStartsOwner(*Rt, 1); +- } +-#endif ++ /* Warning: hypre_BoomerAMGBuildCoarseOperator steals the col_starts ++ from Rt and P (even if they do not own them)! 
*/ ++ hypre_ParCSRMatrixSetRowStartsOwner(rap,0); ++ hypre_ParCSRMatrixSetColStartsOwner(rap,0); ++ if (P_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*P, 1); ++ } ++ if (Rt_owns_its_col_starts) ++ { ++ hypre_ParCSRMatrixSetColStartsOwner(*Rt, 1); ++ } + #endif ++ } + + hypre_ParCSRMatrixSetNumNonzeros(rap); + // hypre_MatvecCommPkgCreate(rap); +@@ -3382,7 +3472,7 @@ int ParCSRRelax_FIR(hypre_ParCSRMatrix *A, // matrix to relax with + + HypreSmoother::HypreSmoother() : Solver() + { +- type = default_type; ++ type = DefaultType(); + relax_times = 1; + relax_weight = 1.0; + omega = 1.0; +@@ -3522,17 +3612,22 @@ void HypreSmoother::SetOperator(const Operator &op) + if (l1_norms && pos_l1_norms) + { + #if defined(HYPRE_USING_GPU) +- double *d_l1_norms = l1_norms; // avoid *this capture +- MFEM_GPU_FORALL(i, height, +- { +- d_l1_norms[i] = std::abs(d_l1_norms[i]); +- }); +-#else +- for (int i = 0; i < height; i++) ++ if (HypreUsingGPU()) + { +- l1_norms[i] = std::abs(l1_norms[i]); ++ double *d_l1_norms = l1_norms; // avoid *this capture ++ MFEM_GPU_FORALL(i, height, ++ { ++ d_l1_norms[i] = std::abs(d_l1_norms[i]); ++ }); + } ++ else + #endif ++ { ++ for (int i = 0; i < height; i++) ++ { ++ l1_norms[i] = std::abs(l1_norms[i]); ++ } ++ } + } + + if (type == 16) +@@ -4808,41 +4903,49 @@ HypreBoomerAMG::HypreBoomerAMG(const HypreParMatrix &A) : HypreSolver(&A) + + void HypreBoomerAMG::SetDefaultOptions() + { +-#if !defined(HYPRE_USING_GPU) +- // AMG coarsening options: +- int coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP +- int agg_levels = 1; // number of aggressive coarsening levels +- double theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 +- + // AMG interpolation options: +- int interp_type = 6; // 6 = extended+i, 0 = classical +- int Pmax = 4; // max number of elements per row in P ++ int coarsen_type, agg_levels, interp_type, Pmax, relax_type, relax_sweeps, ++ print_level, max_levels; ++ double theta; + +- // AMG relaxation 
options: +- int relax_type = 8; // 8 = l1-GS, 6 = symm. GS, 3 = GS, 18 = l1-Jacobi +- int relax_sweeps = 1; // relaxation sweeps on each level ++ if (!HypreUsingGPU()) ++ { ++ // AMG coarsening options: ++ coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP ++ agg_levels = 1; // number of aggressive coarsening levels ++ theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 + +- // Additional options: +- int print_level = 1; // print AMG iterations? 1 = no, 2 = yes +- int max_levels = 25; // max number of levels in AMG hierarchy +-#else +- // AMG coarsening options: +- int coarsen_type = 8; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP +- int agg_levels = 0; // number of aggressive coarsening levels +- double theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 ++ // AMG interpolation options: ++ interp_type = 6; // 6 = extended+i, 0 = classical ++ Pmax = 4; // max number of elements per row in P + +- // AMG interpolation options: +- int interp_type = 6; // 6 = extended+i, or 18 = extended+e +- int Pmax = 4; // max number of elements per row in P ++ // AMG relaxation options: ++ relax_type = 8; // 8 = l1-GS, 6 = symm. GS, 3 = GS, 18 = l1-Jacobi ++ relax_sweeps = 1; // relaxation sweeps on each level + +- // AMG relaxation options: +- int relax_type = 18; // 18 = l1-Jacobi, or 16 = Chebyshev +- int relax_sweeps = 1; // relaxation sweeps on each level ++ // Additional options: ++ print_level = 1; // print AMG iterations? 1 = no, 2 = yes ++ max_levels = 25; // max number of levels in AMG hierarchy ++ } ++ else ++ { ++ // AMG coarsening options: ++ coarsen_type = 8; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP ++ agg_levels = 0; // number of aggressive coarsening levels ++ theta = 0.25; // strength threshold: 0.25, 0.5, 0.8 + +- // Additional options: +- int print_level = 1; // print AMG iterations? 
1 = no, 2 = yes +- int max_levels = 25; // max number of levels in AMG hierarchy +-#endif ++ // AMG interpolation options: ++ interp_type = 6; // 6 = extended+i, or 18 = extended+e ++ Pmax = 4; // max number of elements per row in P ++ ++ // AMG relaxation options: ++ relax_type = 18; // 18 = l1-Jacobi, or 16 = Chebyshev ++ relax_sweeps = 1; // relaxation sweeps on each level ++ ++ // Additional options: ++ print_level = 1; // print AMG iterations? 1 = no, 2 = yes ++ max_levels = 25; // max number of levels in AMG hierarchy ++ } + + HYPRE_BoomerAMGSetCoarsenType(amg_precond, coarsen_type); + HYPRE_BoomerAMGSetAggNumLevels(amg_precond, agg_levels); +@@ -4979,14 +5082,20 @@ void HypreBoomerAMG::SetSystemsOptions(int dim, bool order_bynodes) + // After the addition of hypre_IntArray, mapping is assumed + // to be a device pointer. Previously, it was assumed to be + // a host pointer. ++ HYPRE_Int *mapping = nullptr; + #if defined(hypre_IntArrayData) && defined(HYPRE_USING_GPU) +- HYPRE_Int *mapping = mfem_hypre_CTAlloc(HYPRE_Int, height); +- hypre_TMemcpy(mapping, h_mapping, HYPRE_Int, height, +- HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); +- mfem_hypre_TFree_host(h_mapping); +-#else +- HYPRE_Int *mapping = h_mapping; ++ if (HypreUsingGPU()) ++ { ++ mapping = mfem_hypre_CTAlloc(HYPRE_Int, height); ++ hypre_TMemcpy(mapping, h_mapping, HYPRE_Int, height, ++ HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); ++ mfem_hypre_TFree_host(h_mapping); ++ } ++ else + #endif ++ { ++ mapping = h_mapping; ++ } + + // hypre actually deletes the mapping pointer in HYPRE_BoomerAMGDestroy, + // so we don't need to track it +@@ -5079,7 +5188,10 @@ void HypreBoomerAMG::SetElasticityOptions(ParFiniteElementSpace *fespace_, + bool interp_refine_) + { + #ifdef HYPRE_USING_GPU +- MFEM_ABORT("this method is not supported in hypre built with GPU support"); ++ if (HypreUsingGPU()) ++ { ++ MFEM_ABORT("this method is not supported in hypre built with GPU support"); ++ } + #endif + + // Save the finite element 
space to support multiple calls to SetOperator() +@@ -5315,23 +5427,14 @@ void HypreAMS::MakeSolver(int sdim, int cycle_type) + int rlx_sweeps = 1; + double rlx_weight = 1.0; + double rlx_omega = 1.0; +-#if !defined(HYPRE_USING_GPU) +- int amg_coarsen_type = 10; +- int amg_agg_levels = 1; +- int amg_rlx_type = 8; +- int rlx_type = 2; +- double theta = 0.25; +- int amg_interp_type = 6; +- int amg_Pmax = 4; +-#else +- int amg_coarsen_type = 8; +- int amg_agg_levels = 0; +- int amg_rlx_type = 18; +- int rlx_type = 1; ++ const bool hypre_gpu = HypreUsingGPU(); ++ int amg_coarsen_type = hypre_gpu ? 8 : 10; ++ int amg_agg_levels = hypre_gpu ? 0 : 1; ++ int amg_rlx_type = hypre_gpu ? 18 : 8; ++ int rlx_type = hypre_gpu ? 1: 2; + double theta = 0.25; + int amg_interp_type = 6; + int amg_Pmax = 4; +-#endif + + space_dim = sdim; + ams_cycle_type = cycle_type; +@@ -5692,23 +5795,14 @@ void HypreADS::MakeSolver() + int rlx_sweeps = 1; + double rlx_weight = 1.0; + double rlx_omega = 1.0; +-#if !defined(HYPRE_USING_GPU) +- int rlx_type = 2; +- int amg_coarsen_type = 10; +- int amg_agg_levels = 1; +- int amg_rlx_type = 8; +- double theta = 0.25; +- int amg_interp_type = 6; +- int amg_Pmax = 4; +-#else +- int rlx_type = 1; +- int amg_coarsen_type = 8; +- int amg_agg_levels = 0; +- int amg_rlx_type = 18; ++ const bool hypre_gpu = HypreUsingGPU(); ++ int rlx_type = hypre_gpu ? 1 : 2; ++ int amg_coarsen_type = hypre_gpu ? 8 : 10; ++ int amg_agg_levels = hypre_gpu ? 0 : 1; ++ int amg_rlx_type = hypre_gpu ? 18 : 8; + double theta = 0.25; + int amg_interp_type = 6; + int amg_Pmax = 4; +-#endif + + HYPRE_ADSCreate(&ads); + +diff --git a/linalg/hypre.hpp b/linalg/hypre.hpp +index 8e7802713..4dfa72260 100644 +--- a/linalg/hypre.hpp ++++ b/linalg/hypre.hpp +@@ -43,19 +43,6 @@ + #error "MFEM_USE_HIP=YES is required when HYPRE is built with HIP!" 
+ #endif + +-// MFEM_HYPRE_FORALL is a macro similar to mfem::forall, but it executes on the +-// device that hypre was configured with (no matter what device was selected +-// in MFEM's runtime configuration). +-#if defined(HYPRE_USING_CUDA) +-#define MFEM_HYPRE_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \ +- (int i) {__VA_ARGS__}) +-#elif defined(HYPRE_USING_HIP) +-#define MFEM_HYPRE_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \ +- (int i) {__VA_ARGS__}) +-#else +-#define MFEM_HYPRE_FORALL(i, N,...) for (int i = 0; i < N; i++) { __VA_ARGS__ } +-#endif +- + #include "sparsemat.hpp" + #include "hypre_parcsr.hpp" + +@@ -134,14 +121,19 @@ inline int to_int(HYPRE_Int i) + + + /// The MemoryClass used by Hypre objects. +-inline constexpr MemoryClass GetHypreMemoryClass() ++inline MemoryClass GetHypreMemoryClass() + { + #if !defined(HYPRE_USING_GPU) + return MemoryClass::HOST; + #elif defined(HYPRE_USING_UNIFIED_MEMORY) + return MemoryClass::MANAGED; + #else ++#if MFEM_HYPRE_VERSION >= 22600 ++ return (GetHypreMemoryLocation() == HYPRE_MEMORY_DEVICE) ? MemoryClass::DEVICE : ++ MemoryClass::HOST; ++#else // MFEM_HYPRE_VERSION >= 22600 + return MemoryClass::DEVICE; ++#endif // MFEM_HYPRE_VERSION >= 22600 + #endif + } + +@@ -153,7 +145,27 @@ inline MemoryType GetHypreMemoryType() + #elif defined(HYPRE_USING_UNIFIED_MEMORY) + return MemoryType::MANAGED; + #else ++#if MFEM_HYPRE_VERSION >= 22600 ++ return (GetHypreMemoryLocation() == HYPRE_MEMORY_DEVICE) ? 
MemoryType::DEVICE : ++ Device::GetHostMemoryType(); ++#else // MFEM_HYPRE_VERSION >= 22600 + return MemoryType::DEVICE; ++#endif // MFEM_HYPRE_VERSION >= 22600 ++#endif ++} ++ ++inline bool HypreUsingGPU() ++{ ++#ifdef HYPRE_USING_GPU ++#if MFEM_HYPRE_VERSION >= 22600 ++ HYPRE_MemoryLocation loc; ++ HYPRE_GetMemoryLocation(&loc); ++ return loc == HYPRE_MEMORY_DEVICE; ++#else // MFEM_HYPRE_VERSION >= 22600 ++ return true; ++#endif // MFEM_HYPRE_VERSION >= 22600 ++#else ++ return false; + #endif + } + +@@ -1039,15 +1051,10 @@ public: + enum Type { Jacobi = 0, l1Jacobi = 1, l1GS = 2, l1GStr = 4, lumpedJacobi = 5, + GS = 6, OPFS = 10, Chebyshev = 16, Taubin = 1001, FIR = 1002 + }; +-#if !defined(HYPRE_USING_GPU) +- static constexpr Type default_type = l1GS; +-#else +- static constexpr Type default_type = l1Jacobi; +-#endif + + HypreSmoother(); + +- HypreSmoother(const HypreParMatrix &A_, int type = default_type, ++ HypreSmoother(const HypreParMatrix &A_, int type = DefaultType(), + int relax_times = 1, double relax_weight = 1.0, + double omega = 1.0, int poly_order = 2, + double poly_fraction = .3, int eig_est_cg_iter = 10); +@@ -1094,6 +1101,9 @@ public: + /// Apply transpose of the smoother to relax the linear system Ax=b + virtual void MultTranspose(const Vector &b, Vector &x) const; + ++ /// Return default smoother type for configuration ++ static Type DefaultType() { return HypreUsingGPU() ? 
l1Jacobi : l1GS; } ++ + virtual ~HypreSmoother(); + }; + +diff --git a/linalg/hypre_parcsr.hpp b/linalg/hypre_parcsr.hpp +index 3f428ecaa..c7cb98e76 100644 +--- a/linalg/hypre_parcsr.hpp ++++ b/linalg/hypre_parcsr.hpp +@@ -29,6 +29,33 @@ typedef HYPRE_Int HYPRE_BigInt; + #define HYPRE_MPI_BIG_INT HYPRE_MPI_INT + #endif + ++namespace mfem ++{ ++#if MFEM_HYPRE_VERSION >= 22600 ++inline HYPRE_MemoryLocation GetHypreMemoryLocation() ++{ ++ HYPRE_MemoryLocation loc; ++ HYPRE_GetMemoryLocation(&loc); ++ return loc; ++} ++inline HYPRE_ExecutionPolicy GetHypreExecutionPolicy() ++{ ++ HYPRE_ExecutionPolicy pol; ++ HYPRE_GetExecutionPolicy(&pol); ++ return pol; ++} ++#else ++inline HYPRE_MemoryLocation GetHypreMemoryLocation() ++{ ++#ifdef HYPRE_USING_GPU ++ return HYPRE_MEMORY_DEVICE; ++#else ++ return HYPRE_MEMORY_HOST; ++#endif // HYPRE_USING_GPU ++} ++#endif ++}; ++ + // Define macro wrappers for hypre_TAlloc, hypre_CTAlloc and hypre_TFree: + // mfem_hypre_TAlloc, mfem_hypre_CTAlloc, and mfem_hypre_TFree, respectively. 
+ // Note: these macros are used in hypre.cpp, hypre_parcsr.cpp, and perhaps +@@ -46,10 +73,10 @@ typedef HYPRE_Int HYPRE_BigInt; + #else // MFEM_HYPRE_VERSION >= 21400 + + #define mfem_hypre_TAlloc(type, size) \ +- hypre_TAlloc(type, size, HYPRE_MEMORY_DEVICE) ++ hypre_TAlloc(type, size, mfem::GetHypreMemoryLocation()) + #define mfem_hypre_CTAlloc(type, size) \ +- hypre_CTAlloc(type, size, HYPRE_MEMORY_DEVICE) +-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, HYPRE_MEMORY_DEVICE) ++ hypre_CTAlloc(type, size, mfem::GetHypreMemoryLocation()) ++#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, mfem::GetHypreMemoryLocation()) + + #define mfem_hypre_TAlloc_host(type, size) \ + hypre_TAlloc(type, size, HYPRE_MEMORY_HOST) +diff --git a/miniapps/hdiv-linear-solver/discrete_divergence.cpp b/miniapps/hdiv-linear-solver/discrete_divergence.cpp +index 7b4eb9e4f..fde64a4cd 100644 +--- a/miniapps/hdiv-linear-solver/discrete_divergence.cpp ++++ b/miniapps/hdiv-linear-solver/discrete_divergence.cpp +@@ -85,7 +85,7 @@ void EliminateColumns(HypreParMatrix &D, const Array &ess_dofs) + const auto I = diag->i; + const auto J = diag->j; + auto data = diag->data; +- MFEM_HYPRE_FORALL(i, nrows_diag, ++ mfem::forall_switch(HypreUsingGPU(), nrows_diag, [=] MFEM_HOST_DEVICE (int i) + { + for (int jj=I[i]; jj &ess_dofs) + const auto I = offd->i; + const auto J = offd->j; + auto data = offd->data; +- MFEM_HYPRE_FORALL(i, nrows_offd, ++ mfem::forall_switch(HypreUsingGPU(), nrows_offd, [=] MFEM_HOST_DEVICE (int i) + { + for (int jj=I[i]; jj::Copy(Array ©) const +@@ -868,11 +868,11 @@ inline void Array::Copy(Array ©) const } template @@ -379,7 +379,7 @@ index 0f16b3023..a19b099a1 100644 template diff --git a/general/communication.cpp b/general/communication.cpp -index 0c2fffc1f..e8002a273 100644 +index 840d1e39c..48209622e 100644 --- a/general/communication.cpp +++ b/general/communication.cpp @@ -275,7 +275,7 @@ void GroupTopology::Save(ostream &os) const @@ -392,7 +392,7 @@ index 
0c2fffc1f..e8002a273 100644 { int group_size = GetGroupSize(group_id); diff --git a/general/table.hpp b/general/table.hpp -index 2ed9f4a1b..96373b2d1 100644 +index 6f90e95bf..85efaeaec 100644 --- a/general/table.hpp +++ b/general/table.hpp @@ -208,6 +208,7 @@ void Transpose (const Table &A, Table &At, int ncols_A_ = -1); @@ -404,7 +404,7 @@ index 2ed9f4a1b..96373b2d1 100644 /// C = A * B (as boolean matrices) diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 7ce605dc8..7482859d3 100644 +index 572ad262d..ebb273f62 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp @@ -20,6 +20,7 @@ @@ -423,7 +423,7 @@ index 7ce605dc8..7482859d3 100644 #include // Include the METIS header, if using version 5. If using METIS 4, the needed -@@ -1290,7 +1292,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const +@@ -1332,7 +1334,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const return res; } @@ -432,7 +432,7 @@ index 7ce605dc8..7482859d3 100644 { os << "face topology="; switch (info.topology) -@@ -3124,7 +3126,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3166,7 +3168,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) { GetElementToFaceTable(); GenerateFaces(); @@ -441,7 +441,7 @@ index 7ce605dc8..7482859d3 100644 { GenerateBoundaryElements(); GetElementToFaceTable(); // update be_to_face -@@ -3144,7 +3146,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3186,7 +3188,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) if (Dim == 2) { GenerateFaces(); // 'Faces' in 2D refers to the edges @@ -450,7 +450,7 @@ index 7ce605dc8..7482859d3 100644 { GenerateBoundaryElements(); } -@@ -3158,7 +3160,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) +@@ -3200,7 +3202,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) if (Dim == 1) { GenerateFaces(); @@ -459,7 +459,7 @@ index 7ce605dc8..7482859d3 100644 { // be_to_face will be set inside GenerateBoundaryElements GenerateBoundaryElements(); -@@ -6083,6 +6085,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() 
const +@@ -6125,6 +6127,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const void Mesh::SetCurvature(int order, bool discont, int space_dim, int ordering) { @@ -472,7 +472,7 @@ index 7ce605dc8..7482859d3 100644 space_dim = (space_dim == -1) ? spaceDim : space_dim; FiniteElementCollection* nfec; if (discont) -@@ -13079,6 +13087,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, +@@ -13121,6 +13129,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, } @@ -1352,7 +1352,7 @@ index 7ce605dc8..7482859d3 100644 int flags, MemoryType d_mt) { diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index 9820b4e95..fd8111f5e 100644 +index 12a7b3ed9..3870604aa 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp @@ -29,6 +29,7 @@ @@ -1401,7 +1401,7 @@ index 9820b4e95..fd8111f5e 100644 const std::string &comments = "") const; /// Used in GetFaceElementTransformations (...) -@@ -546,7 +549,7 @@ protected: +@@ -541,7 +544,7 @@ protected: mfem v1.2 format with the given section_delimiter at the end. If @a comments is non-empty, it will be printed after the first line of the file, and each line should begin with '#'. */ @@ -1410,7 +1410,7 @@ index 9820b4e95..fd8111f5e 100644 std::string section_delimiter = "", const std::string &comments = "") const; -@@ -957,7 +960,7 @@ public: +@@ -950,7 +953,7 @@ public: int AddBdrPoint(int v, int attr = 1); @@ -1419,7 +1419,7 @@ index 9820b4e95..fd8111f5e 100644 /// Finalize the construction of a triangular Mesh. 
void FinalizeTriMesh(int generate_edges = 0, int refine = 0, bool fix_orientation = true); -@@ -2203,7 +2206,7 @@ public: +@@ -2256,7 +2259,7 @@ public: std::ostream &os, int elem_attr = 0) const; void PrintElementsWithPartitioning (int *partitioning, @@ -1428,7 +1428,7 @@ index 9820b4e95..fd8111f5e 100644 int interior_faces = 0); /// Print set of disjoint surfaces: -@@ -2211,13 +2214,13 @@ public: +@@ -2264,13 +2267,13 @@ public: * If Aface_face(i,j) != 0, print face j as a boundary * element with attribute i+1. */ @@ -1444,7 +1444,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Compute and print mesh characteristics such as number of vertices, number of elements, number of boundary elements, minimal and maximal -@@ -2237,7 +2240,7 @@ public: +@@ -2290,7 +2293,7 @@ public: #ifdef MFEM_DEBUG /// Output an NCMesh-compatible debug dump. @@ -1453,7 +1453,7 @@ index 9820b4e95..fd8111f5e 100644 #endif /// @} -@@ -2316,9 +2319,194 @@ public: +@@ -2369,9 +2372,194 @@ public: /// @} }; @@ -1651,7 +1651,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Structure for storing mesh geometric factors: coordinates, Jacobians, -@@ -2327,7 +2515,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); +@@ -2380,7 +2568,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); Mesh. See Mesh::GetGeometricFactors(). */ class GeometricFactors { @@ -1659,7 +1659,7 @@ index 9820b4e95..fd8111f5e 100644 private: void Compute(const GridFunction &nodes, MemoryType d_mt = MemoryType::DEFAULT); -@@ -2375,6 +2562,7 @@ public: +@@ -2428,6 +2615,7 @@ public: Vector detJ; }; @@ -1667,7 +1667,7 @@ index 9820b4e95..fd8111f5e 100644 /** @brief Structure for storing face geometric factors: coordinates, Jacobians, determinants of the Jacobians, and normal vectors. 
*/ /** Typically objects of this type are constructed and owned by objects of class -@@ -2429,6 +2617,7 @@ public: +@@ -2482,6 +2670,7 @@ public: Vector normal; }; @@ -1675,7 +1675,7 @@ index 9820b4e95..fd8111f5e 100644 /// Class used to extrude the nodes of a mesh class NodeExtrudeCoefficient : public VectorCoefficient { -@@ -2460,8 +2649,12 @@ inline void ShiftRight(int &a, int &b, int &c) +@@ -2513,8 +2702,12 @@ inline void ShiftRight(int &a, int &b, int &c) a = c; c = b; b = t; } @@ -1690,7 +1690,7 @@ index 9820b4e95..fd8111f5e 100644 } diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 97b29108b..5017cd012 100644 +index 1c212c077..3f23d1877 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp @@ -249,6 +249,8 @@ ParMesh::ParMesh(MPI_Comm comm, Mesh &mesh, int *partitioning_, @@ -1710,7 +1710,7 @@ index 97b29108b..5017cd012 100644 Mesh::Finalize(refine, fix_orientation); -@@ -6245,6 +6248,7 @@ void ParMesh::ParPrint(ostream &os, const std::string &comments) const +@@ -6278,6 +6281,7 @@ void ParMesh::ParPrint(ostream &os, const std::string &comments) const { os << "total_shared_faces " << sface_lface.Size() << '\n'; } @@ -1719,10 +1719,10 @@ index 97b29108b..5017cd012 100644 { { diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index af5aa9a7c..d723204f6 100644 +index b7744f19c..479c6b502 100644 --- a/mesh/pmesh.hpp +++ b/mesh/pmesh.hpp -@@ -108,6 +108,8 @@ protected: +@@ -106,6 +106,8 @@ protected: // Convert the local 'meshgen' to a global one. 
void ReduceMeshGen(); @@ -1732,7 +1732,7 @@ index af5aa9a7c..d723204f6 100644 void FinalizeParTopo(); diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index d094b70cd..22eecbd42 100644 +index 283b35f8a..9653316bf 100644 --- a/mesh/tetrahedron.cpp +++ b/mesh/tetrahedron.cpp @@ -53,7 +53,7 @@ void Tetrahedron::Init(int ind1, int ind2, int ind3, int ind4, int attr, @@ -1758,7 +1758,7 @@ index d094b70cd..22eecbd42 100644 switch (face) { diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp -index 80f1de4e0..e6093bc08 100644 +index 2062c42ba..83665454e 100644 --- a/mesh/tetrahedron.hpp +++ b/mesh/tetrahedron.hpp @@ -58,12 +58,13 @@ public: @@ -1779,7 +1779,7 @@ index 80f1de4e0..e6093bc08 100644 void SetRefinementFlag(int rf) { refinement_flag = rf; } diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile -index 1ccec0455..e34a5637e 100644 +index 7510d8892..2029d1967 100644 --- a/miniapps/meshing/makefile +++ b/miniapps/meshing/makefile @@ -123,7 +123,7 @@ clean-build: @@ -1792,7 +1792,7 @@ index 1ccec0455..e34a5637e 100644 @rm -f partitioning.txt shaper.mesh extruder.mesh @rm -f optimized* perturbed* polar-nc.mesh diff --git a/miniapps/meshing/mesh-explorer.cpp b/miniapps/meshing/mesh-explorer.cpp -index f05e18e83..67e8b1f65 100644 +index 925371f7f..a6257a034 100644 --- a/miniapps/meshing/mesh-explorer.cpp +++ b/miniapps/meshing/mesh-explorer.cpp @@ -308,6 +308,7 @@ int main (int argc, char *argv[]) diff --git a/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff b/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff new file mode 100644 index 000000000..fdaf7cadf --- /dev/null +++ b/extern/patch/mfem/patch_mesh_prism_vtu_fix.diff @@ -0,0 +1,13 @@ +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index aa416e1b1..42faa8aa5 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -11581,7 +11581,7 @@ void Mesh::PrintVTU(std::ostream &os, int ref, VTKFormat format, + const int *p = VTKGeometry::VertexPermutation[geom]; + for (int k = 0; k < nv; k++, j++) + { +- 
WriteBinaryOrASCII(os, buf, np + RG[p ? p[j] : j], " ", ++ WriteBinaryOrASCII(os, buf, np + RG[p ? (j - k + p[k]) : j], " ", + format); + } + if (format == VTKFormat::ASCII) { os << '\n'; } diff --git a/extern/patch/mfem/patch_mesh_vis_dev.diff b/extern/patch/mfem/patch_mesh_vis_dev.diff index 0541362af..02ad69963 100644 --- a/extern/patch/mfem/patch_mesh_vis_dev.diff +++ b/extern/patch/mfem/patch_mesh_vis_dev.diff @@ -1,5 +1,5 @@ diff --git a/fem/datacollection.cpp b/fem/datacollection.cpp -index 0dc718b07..3d478edaa 100644 +index d73d2bb5b..f1a21e3d6 100644 --- a/fem/datacollection.cpp +++ b/fem/datacollection.cpp @@ -210,6 +210,11 @@ void DataCollection::Save() @@ -282,7 +282,7 @@ index 0dc718b07..3d478edaa 100644 { restart_mode = restart_mode_; diff --git a/fem/datacollection.hpp b/fem/datacollection.hpp -index c216188af..4a26cd1f8 100644 +index 351c1239d..3b33198cf 100644 --- a/fem/datacollection.hpp +++ b/fem/datacollection.hpp @@ -133,6 +133,10 @@ private: @@ -470,10 +470,10 @@ index c216188af..4a26cd1f8 100644 /// preserve timestep metadata for any solutions prior to the currently /// defined time. diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 102ee4de6..625b049c0 100644 +index 41ac08480..572ad262d 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp -@@ -11423,7 +11423,7 @@ void Mesh::PrintVTU(std::string fname, +@@ -11476,7 +11476,7 @@ void Mesh::PrintVTU(std::string fname, VTKFormat format, bool high_order_output, int compression_level, @@ -482,7 +482,7 @@ index 102ee4de6..625b049c0 100644 { int ref = (high_order_output && Nodes) ? 
Nodes->FESpace()->GetMaxElementOrder() : 1; -@@ -11437,7 +11437,7 @@ void Mesh::PrintVTU(std::string fname, +@@ -11490,7 +11490,7 @@ void Mesh::PrintVTU(std::string fname, } os << " byte_order=\"" << VTKByteOrder() << "\">\n"; os << "\n"; @@ -492,10 +492,10 @@ index 102ee4de6..625b049c0 100644 os << "\n"; os << "" << std::endl; diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index b82ce4a76..317e4349a 100644 +index 02a02dfc6..12a7b3ed9 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp -@@ -2186,7 +2186,7 @@ public: +@@ -2240,7 +2240,7 @@ public: VTKFormat format=VTKFormat::ASCII, bool high_order_output=false, int compression_level=0, @@ -505,10 +505,10 @@ index b82ce4a76..317e4349a 100644 boundary attributes as a data array (useful for boundary conditions). */ void PrintBdrVTU(std::string fname, diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 81012851d..d4f6d1abd 100644 +index ac353e3d7..1c212c077 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp -@@ -6303,7 +6303,7 @@ void ParMesh::PrintVTU(std::string pathname, +@@ -6332,7 +6332,7 @@ void ParMesh::PrintVTU(std::string pathname, VTKFormat format, bool high_order_output, int compression_level, @@ -517,7 +517,7 @@ index 81012851d..d4f6d1abd 100644 { int pad_digits_rank = 6; DataCollection::create_directory(pathname, this, MyRank); -@@ -6363,7 +6363,8 @@ void ParMesh::PrintVTU(std::string pathname, +@@ -6392,7 +6392,8 @@ void ParMesh::PrintVTU(std::string pathname, std::string vtu_fname = pathname + "/" + fname + ".proc" + to_padded_string(MyRank, pad_digits_rank); @@ -528,10 +528,10 @@ index 81012851d..d4f6d1abd 100644 int ParMesh::FindPoints(DenseMatrix& point_mat, Array& elem_id, diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index 472a0491e..af5aa9a7c 100644 +index 1aa027a3c..b7744f19c 100644 --- a/mesh/pmesh.hpp +++ b/mesh/pmesh.hpp -@@ -690,7 +690,7 @@ public: +@@ -721,7 +721,7 @@ public: VTKFormat format=VTKFormat::ASCII, bool high_order_output=false, int compression_level=0, diff --git 
a/extern/patch/mfem/patch_mfem_device_fixes.diff b/extern/patch/mfem/patch_mfem_device_fixes.diff index 448adba25..03da57976 100644 --- a/extern/patch/mfem/patch_mfem_device_fixes.diff +++ b/extern/patch/mfem/patch_mfem_device_fixes.diff @@ -1,8 +1,8 @@ diff --git a/general/device.cpp b/general/device.cpp -index ccee71cd7..f664f70c3 100644 +index b239eea59..8b3e586ba 100644 --- a/general/device.cpp +++ b/general/device.cpp -@@ -9,14 +9,13 @@ +@@ -9,14 +9,12 @@ // terms of the BSD-3 license. We welcome feedback and contributions, see file // CONTRIBUTING.md for details. @@ -12,13 +12,13 @@ index ccee71cd7..f664f70c3 100644 #ifdef MFEM_USE_CEED #include "../fem/ceed/interface/util.hpp" #endif - +- -#include -#include #include namespace mfem -@@ -144,13 +143,11 @@ Device::Device() +@@ -144,13 +142,11 @@ Device::Device() } } @@ -32,7 +32,7 @@ index ccee71cd7..f664f70c3 100644 #ifdef MFEM_USE_CEED // Destroy FES -> CeedBasis, CeedElemRestriction hash table contents for (auto entry : internal::ceed_basis_map) -@@ -169,7 +166,6 @@ Device::~Device() +@@ -169,7 +165,6 @@ Device::~Device() mm.Destroy(); } Get().ngpu = -1; @@ -40,18 +40,19 @@ index ccee71cd7..f664f70c3 100644 Get().backends = Backend::CPU; Get().host_mem_type = MemoryType::HOST; Get().host_mem_class = MemoryClass::HOST; -@@ -189,6 +185,7 @@ void Device::Configure(const std::string &device, const int device_id) +@@ -193,28 +188,22 @@ void Device::Configure(const std::string &device, const int device_id) + { + bmap[internal::backend_name[i]] = internal::backend_list[i]; } - - std::map bmap; +- std::string::size_type beg = 0, end, option; + std::string device_option; - for (int i = 0; i < Backend::NUM_BACKENDS; i++) ++ std::string::size_type beg = 0, end; + while (1) { - bmap[internal::backend_name[i]] = internal::backend_list[i]; -@@ -200,21 +197,14 @@ void Device::Configure(const std::string &device, const int device_id) + end = device.find(',', beg); end = (end != std::string::npos) ? 
end : device.size(); const std::string bname = device.substr(beg, end - beg); - option = bname.find(':'); +- option = bname.find(':'); - if (option==std::string::npos) // No option - { - const std::string backend = bname; @@ -60,9 +61,10 @@ index ccee71cd7..f664f70c3 100644 - Get().MarkBackend(it->second); - } - else ++ const auto option = bname.find(':'); + const std::string backend = (option != std::string::npos) ? + bname.substr(0, option) : bname; -+ std::map::iterator it = bmap.find(backend); ++ const auto it = bmap.find(backend); + MFEM_VERIFY(it != bmap.end(), "Invalid backend name: '" << backend << '\''); + Get().MarkBackend(it->second); + if (option != std::string::npos) @@ -77,7 +79,7 @@ index ccee71cd7..f664f70c3 100644 } if (end == device.size()) { break; } beg = end + 1; -@@ -240,10 +230,10 @@ void Device::Configure(const std::string &device, const int device_id) +@@ -240,10 +229,10 @@ void Device::Configure(const std::string &device, const int device_id) #endif // Perform setup. @@ -86,137 +88,39 @@ index ccee71cd7..f664f70c3 100644 - // Enable the device - Enable(); -+ // Enable the device. ++ // Configure the host/device MemoryType/MemoryClass. + Get().UpdateMemoryTypeAndClass(device_option); // Copy all data members from the global 'singleton_device' into '*this'. if (this != &Get()) { std::memcpy(this, &Get(), sizeof(Device)); } -@@ -252,30 +242,6 @@ void Device::Configure(const std::string &device, const int device_id) - destroy_mm = true; - } - --// static method --void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) --{ -- // If the device and/or the MemoryTypes are configured through the -- // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -- // method. 
-- if (mem_host_env || mem_device_env || device_env) { return; } -- -- MFEM_VERIFY(!IsConfigured(), "the default MemoryTypes can only be set before" -- " Device construction and configuration"); -- MFEM_VERIFY(IsHostMemory(h_mt), -- "invalid host MemoryType, h_mt = " << (int)h_mt); -- MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -- "invalid device MemoryType, d_mt = " << (int)d_mt -- << " (h_mt = " << (int)h_mt << ')'); -- -- Get().host_mem_type = h_mt; -- Get().device_mem_type = d_mt; -- mem_types_set = true; -- -- // h_mt and d_mt will be set as dual to each other during configuration by -- // the call mm.Configure(...) in UpdateMemoryTypeAndClass() --} -- - void Device::Print(std::ostream &os) - { - os << "Device configuration: "; -@@ -307,96 +273,53 @@ void Device::Print(std::ostream &os) +@@ -307,10 +296,9 @@ void Device::Print(std::ostream &os) os << std::endl; } -void Device::UpdateMemoryTypeAndClass() -+// static method -+void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) ++void Device::UpdateMemoryTypeAndClass(const std::string &device_option) { -- const bool debug = Device::Allows(Backend::DEBUG_DEVICE); + const bool debug = Device::Allows(Backend::DEBUG_DEVICE); - -- const bool device = Device::Allows(Backend::DEVICE_MASK); -- --#ifdef MFEM_USE_UMPIRE -- // If MFEM has been compiled with Umpire support, use it as the default -- if (!mem_host_env && !mem_types_set) -- { -- host_mem_type = MemoryType::HOST_UMPIRE; -- if (!mem_device_env) -- { -- device_mem_type = MemoryType::HOST_UMPIRE; -- } -- } --#endif -- -- // Enable the device memory type -- if (device) -- { -- if (!mem_device_env) -- { -- if (mem_host_env) -- { -- switch (host_mem_type) -- { -- case MemoryType::HOST_UMPIRE: -- device_mem_type = MemoryType::DEVICE_UMPIRE; -- break; -- case MemoryType::HOST_DEBUG: -- device_mem_type = MemoryType::DEVICE_DEBUG; -- break; -- default: -- device_mem_type = MemoryType::DEVICE; -- } -- } -- else if (!mem_types_set) -- { --#ifndef 
MFEM_USE_UMPIRE -- device_mem_type = MemoryType::DEVICE; --#else -- device_mem_type = MemoryType::DEVICE_UMPIRE; --#endif -- } -- } -- device_mem_class = MemoryClass::DEVICE; -- } -- -- // Enable the UVM shortcut when requested -- if (device && device_option && !strcmp(device_option, "uvm")) -- { -- host_mem_type = MemoryType::MANAGED; -- device_mem_type = MemoryType::MANAGED; -- } -+ // If the device and/or the MemoryTypes are configured through the -+ // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -+ // method. -+ if (mem_host_env || mem_device_env || device_env) { return; } + const bool device = Device::Allows(Backend::DEVICE_MASK); -- // Enable the DEBUG mode when requested -- if (debug) -- { -- host_mem_type = MemoryType::HOST_DEBUG; -- device_mem_type = MemoryType::DEVICE_DEBUG; -- } -+ MFEM_VERIFY(!IsConfigured(), "The default MemoryTypes can only be set before" -+ " Device construction and configuration"); -+ MFEM_VERIFY(IsHostMemory(h_mt), -+ "Invalid host MemoryType, h_mt = " << (int)h_mt); -+ MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -+ "Invalid device MemoryType, d_mt = " << (int)d_mt -+ << " (h_mt = " << (int)h_mt << ')'); - -- MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -- "invalid device memory configuration!"); -+ Get().host_mem_type = h_mt; -+ Get().device_mem_type = d_mt; -+ mem_types_set = true; - -- // Update the memory manager with the new settings -- mm.Configure(host_mem_type, device_mem_type); -+ // h_mt and d_mt will be set as dual to each other during configuration by -+ // the call mm.Configure(...) in UpdateMemoryTypeAndClass(). 
+ #ifdef MFEM_USE_UMPIRE +@@ -357,7 +345,7 @@ void Device::UpdateMemoryTypeAndClass() + } + + // Enable the UVM shortcut when requested +- if (device && device_option && !strcmp(device_option, "uvm")) ++ if (device && device_option.find(":uvm") != std::string::npos) + { + host_mem_type = MemoryType::MANAGED; + device_mem_type = MemoryType::MANAGED; +@@ -377,26 +365,29 @@ void Device::UpdateMemoryTypeAndClass() + mm.Configure(host_mem_type, device_mem_type); } -void Device::Enable() +// static method -+int Device::GetNumGPU() ++int Device::GetDeviceCount() { - const bool accelerated = Get().backends & ~(Backend::CPU); - if (accelerated) { Get().mode = Device::ACCELERATED;} @@ -253,7 +157,7 @@ index ccee71cd7..f664f70c3 100644 #else MFEM_CONTRACT_VAR(dev); MFEM_CONTRACT_VAR(ngpu); -@@ -418,7 +341,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) +@@ -418,7 +409,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) static void RajaDeviceSetup(const int dev, int &ngpu) { #ifdef MFEM_USE_CUDA @@ -262,76 +166,25 @@ index ccee71cd7..f664f70c3 100644 #elif defined(MFEM_USE_HIP) HipDeviceSetup(dev, ngpu); #else -@@ -443,7 +366,7 @@ static void OccaDeviceSetup(const int dev) - std::string mode("mode: 'CUDA', device_id : "); - internal::occaDevice.setup(mode.append(1,'0'+dev)); - #else -- MFEM_ABORT("the OCCA CUDA backend requires OCCA built with CUDA!"); -+ MFEM_ABORT("The OCCA CUDA backend requires OCCA built with CUDA!"); - #endif - } - else if (omp) -@@ -451,7 +374,7 @@ static void OccaDeviceSetup(const int dev) - #if OCCA_OPENMP_ENABLED - internal::occaDevice.setup("mode: 'OpenMP'"); - #else -- MFEM_ABORT("the OCCA OpenMP backend requires OCCA built with OpenMP!"); -+ MFEM_ABORT("The OCCA OpenMP backend requires OCCA built with OpenMP!"); - #endif - } - else -@@ -477,7 +400,7 @@ static void OccaDeviceSetup(const int dev) - occa::loadKernels("mfem"); - #else - MFEM_CONTRACT_VAR(dev); -- MFEM_ABORT("the OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); 
-+ MFEM_ABORT("The OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); - #endif - } - -@@ -502,80 +425,136 @@ static void CeedDeviceSetup(const char* ceed_spec) +@@ -502,7 +493,7 @@ static void CeedDeviceSetup(const char* ceed_spec) #endif } -void Device::Setup(const int device_id) +void Device::Setup(const std::string &device_option, const int device_id) { -- MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); -+ MFEM_VERIFY(ngpu == -1, "The mfem::Device is already configured!"); - - ngpu = 0; - dev = device_id; - #ifndef MFEM_USE_CUDA - MFEM_VERIFY(!Allows(Backend::CUDA_MASK), -- "the CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); -+ "The CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); - #endif - #ifndef MFEM_USE_HIP - MFEM_VERIFY(!Allows(Backend::HIP_MASK), -- "the HIP backends require MFEM built with MFEM_USE_HIP=YES"); -+ "The HIP backends require MFEM built with MFEM_USE_HIP=YES"); - #endif - #ifndef MFEM_USE_RAJA - MFEM_VERIFY(!Allows(Backend::RAJA_MASK), -- "the RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); -+ "The RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); - #endif - #ifndef MFEM_USE_OPENMP - MFEM_VERIFY(!Allows(Backend::OMP|Backend::RAJA_OMP), -- "the OpenMP and RAJA OpenMP backends require MFEM built with" -+ "The OpenMP and RAJA OpenMP backends require MFEM built with" - " MFEM_USE_OPENMP=YES"); - #endif + MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); + +@@ -528,54 +519,41 @@ void Device::Setup(const int device_id) #ifndef MFEM_USE_CEED MFEM_VERIFY(!Allows(Backend::CEED_MASK), -- "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); + "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); -#else - int ceed_cpu = Allows(Backend::CEED_CPU); - int ceed_cuda = Allows(Backend::CEED_CUDA); - int ceed_hip = Allows(Backend::CEED_HIP); - MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip <= 1, - "Only one CEED backend can be enabled at a time!"); -+ "The 
CEED backends require MFEM built with MFEM_USE_CEED=YES"); #endif if (Allows(Backend::CUDA)) { CudaDeviceSetup(dev, ngpu); } if (Allows(Backend::HIP)) { HipDeviceSetup(dev, ngpu); } @@ -340,6 +193,29 @@ index ccee71cd7..f664f70c3 100644 - // The check for MFEM_USE_OCCA is in the function OccaDeviceSetup(). if (Allows(Backend::OCCA_MASK)) { OccaDeviceSetup(dev); } - if (Allows(Backend::CEED_CPU)) +- { +- if (!device_option) +- { +- CeedDeviceSetup("/cpu/self"); +- } +- else +- { +- CeedDeviceSetup(device_option); +- } +- } +- if (Allows(Backend::CEED_CUDA)) +- { +- if (!device_option) +- { +- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! +- CeedDeviceSetup("/gpu/cuda/gen"); +- } +- else +- { +- CeedDeviceSetup(device_option); +- } +- } +- if (Allows(Backend::CEED_HIP)) + if (Allows(Backend::CEED_MASK)) { - if (!device_option) @@ -360,7 +236,7 @@ index ccee71cd7..f664f70c3 100644 + std::string::size_type beg = device_option.find(ceed_spec_search), end; + if (beg == std::string::npos) { -- CeedDeviceSetup("/cpu/self"); +- CeedDeviceSetup("/gpu/hip"); + CeedDeviceSetup(ceed_spec_default); } else @@ -371,98 +247,13 @@ index ccee71cd7..f664f70c3 100644 + CeedDeviceSetup(device_option.substr(beg + 1, end - beg - 1).c_str()); } } -- if (Allows(Backend::CEED_CUDA)) -+ if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+} -+ -+void Device::UpdateMemoryTypeAndClass(const std::string &device_option) -+{ -+ const bool debug = Device::Allows(Backend::DEBUG_DEVICE); -+ const bool device = Device::Allows(Backend::DEVICE_MASK); -+ -+#ifdef MFEM_USE_UMPIRE -+ // If MFEM has been compiled with Umpire support, use it as the default -+ if (!mem_host_env && !mem_types_set) - { -- if (!device_option) -- { -- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! 
-- CeedDeviceSetup("/gpu/cuda/gen"); -- } -- else -+ host_mem_type = MemoryType::HOST_UMPIRE; -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ device_mem_type = MemoryType::HOST_UMPIRE; - } - } -- if (Allows(Backend::CEED_HIP)) -+#endif -+ -+ // Enable the device memory type -+ if (device) - { -- if (!device_option) -- { -- CeedDeviceSetup("/gpu/hip"); -- } -- else -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ if (mem_host_env) -+ { -+ switch (host_mem_type) -+ { -+ case MemoryType::HOST_UMPIRE: -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+ break; -+ case MemoryType::HOST_DEBUG: -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ break; -+ default: -+ device_mem_type = MemoryType::DEVICE; -+ } -+ } -+ else if (!mem_types_set) -+ { -+#ifndef MFEM_USE_UMPIRE -+ device_mem_type = MemoryType::DEVICE; -+#else -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+#endif -+ } - } -+ device_mem_class = MemoryClass::DEVICE; - } -- if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+ -+ // Enable the UVM shortcut when requested -+ if (device && device_option.find(":uvm") != std::string::npos) -+ { -+ host_mem_type = MemoryType::MANAGED; -+ device_mem_type = MemoryType::MANAGED; -+ } -+ -+ // Enable the DEBUG mode when requested -+ if (debug) -+ { -+ host_mem_type = MemoryType::HOST_DEBUG; -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ } -+ -+ MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -+ "Invalid device memory configuration!"); -+ -+ // Update the memory manager with the new settings -+ mm.Configure(host_mem_type, device_mem_type); + if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } } -} // mfem +} // namespace mfem diff --git a/general/device.hpp b/general/device.hpp -index baa27397f..a2d89e22e 100644 +index 5e236430b..07e6d0c19 100644 --- a/general/device.hpp +++ b/general/device.hpp @@ -14,6 +14,7 @@ @@ -473,15 +264,7 @@ index baa27397f..a2d89e22e 100644 namespace mfem { -@@ -81,7 +82,6 @@ struct Backend - { - /// Number 
of backends: from (1 << 0) to (1 << (NUM_BACKENDS-1)). - NUM_BACKENDS = 15, -- - /// Biwise-OR of all CPU backends - CPU_MASK = CPU | RAJA_CPU | OCCA_CPU | CEED_CPU, - /// Biwise-OR of all CUDA backends -@@ -94,7 +94,6 @@ struct Backend +@@ -94,7 +95,6 @@ struct Backend CEED_MASK = CEED_CPU | CEED_CUDA | CEED_HIP, /// Biwise-OR of all device backends DEVICE_MASK = CUDA_MASK | HIP_MASK | DEBUG_DEVICE, @@ -489,15 +272,14 @@ index baa27397f..a2d89e22e 100644 /// Biwise-OR of all RAJA backends RAJA_MASK = RAJA_CPU | RAJA_OMP | RAJA_CUDA | RAJA_HIP, /// Biwise-OR of all OCCA backends -@@ -122,50 +121,44 @@ class Device +@@ -122,16 +122,16 @@ class Device { private: friend class MemoryManager; - enum MODES {SEQUENTIAL, ACCELERATED}; -- -- static bool device_env, mem_host_env, mem_device_env, mem_types_set; + + static bool device_env, mem_host_env, mem_device_env, mem_types_set; static MFEM_EXPORT Device device_singleton; -+ static bool device_env, mem_host_env, mem_device_env, mem_types_set; - MODES mode = Device::SEQUENTIAL; int dev = 0; ///< Device ID of the configured device. @@ -509,19 +291,8 @@ index baa27397f..a2d89e22e 100644 /// Set to true during configuration, except in 'device_singleton'. bool destroy_mm = false; bool mpi_gpu_aware = false; - -- MemoryType host_mem_type = MemoryType::HOST; ///< Current Host MemoryType -- MemoryClass host_mem_class = MemoryClass::HOST; ///< Current Host MemoryClass -+ /// Current host MemoryType. -+ MemoryType host_mem_type = MemoryType::HOST; -+ /// Current host MemoryClass. -+ MemoryClass host_mem_class = MemoryClass::HOST; - -- /// Current Device MemoryType -+ /// Current device MemoryType. - MemoryType device_mem_type = MemoryType::HOST; -- /// Current Device MemoryClass -+ /// Current device MemoryClass. 
+@@ -144,28 +144,21 @@ private: + /// Current Device MemoryClass MemoryClass device_mem_class = MemoryClass::HOST; - char *device_option = NULL; @@ -531,27 +302,27 @@ index baa27397f..a2d89e22e 100644 - - /// Setup switcher based on configuration settings - void Setup(const int device_id = 0); +- +- void MarkBackend(Backend::Id b) { backends |= b; } + // Delete copy constructor and copy assignment. + Device(Device const &) = delete; + void operator=(Device const &) = delete; -- void MarkBackend(Backend::Id b) { backends |= b; } -+ // Access the Device singleton. -+ static Device &Get() { return device_singleton; } - - void UpdateMemoryTypeAndClass(); -+ /// Setup switcher based on configuration settings. -+ void Setup(const std::string &device_option, const int device_id); ++ // Access the Device singleton. ++ static Device& Get() { return device_singleton; } - /// Enable the use of the configured device in the code that follows. - /** After this call MFEM classes will use the backend kernels whenever - possible, transferring data automatically to the device, if necessary. -+ /// Configure host/device MemoryType/MemoryClass. -+ void UpdateMemoryTypeAndClass(const std::string &device_option); ++ /// Setup switcher based on configuration settings. ++ void Setup(const std::string &device_option, const int device_id); - If the only configured backend is the default host CPU one, the device - will remain disabled. -- ++ /// Configure host/device MemoryType/MemoryClass. ++ void UpdateMemoryTypeAndClass(const std::string &device_option); + - If the device is actually enabled, this method will also update the - current host/device MemoryType and MemoryClass. */ - static void Enable(); @@ -581,91 +352,42 @@ index baa27397f..a2d89e22e 100644 * The available backends are described by the Backend class. * The string name of a backend is the lowercase version of the Backend::Id enumeration constant with '_' replaced by '-', e.g. 
the -@@ -219,8 +212,12 @@ public: +@@ -219,8 +212,10 @@ public: and evaluation of operators and enables the 'hip' backend to avoid transfers between host and device. * The 'debug' backend should not be combined with other device backends. - */ - void Configure(const std::string &device, const int dev = 0); ++ + @note If the device is actually enabled, this method will also update the + current host/device MemoryType and MemoryClass. */ + void Configure(const std::string &device, const int device_id = 0); -+ -+ /// Print the configuration of the MFEM virtual device object. -+ void Print(std::ostream &out = mfem::out); /// Set the default host and device MemoryTypes, @a h_mt and @a d_mt. /** The host and device MemoryTypes are also set to be dual to each other. -@@ -233,60 +230,64 @@ public: - the subsequent Device configuration. */ - static void SetMemoryTypes(MemoryType h_mt, MemoryType d_mt); - -- /// Print the configuration of the MFEM virtual device object. -- void Print(std::ostream &out = mfem::out); -- - /// Return true if Configure() has been called previously. -- static inline bool IsConfigured() { return Get().ngpu >= 0; } -+ static bool IsConfigured() { return Get().ngpu >= 0; } - - /// Return true if an actual device (e.g. GPU) has been configured. -- static inline bool IsAvailable() { return Get().ngpu > 0; } -+ static bool IsAvailable() { return Get().ngpu > 0; } +@@ -243,14 +238,17 @@ public: + static inline bool IsAvailable() { return Get().ngpu > 0; } /// Return true if any backend other than Backend::CPU is enabled. - static inline bool IsEnabled() { return Get().mode == ACCELERATED; } -+ static bool IsEnabled() { return Get().backends & ~(Backend::CPU); } ++ static inline bool IsEnabled() { return Get().backends & ~(Backend::CPU); } /// The opposite of IsEnabled(). -- static inline bool IsDisabled() { return !IsEnabled(); } -+ static bool IsDisabled() { return !IsEnabled(); } -+ -+ /// Get the device ID of the configured device. 
-+ static int GetId() { return Get().dev; } + static inline bool IsDisabled() { return !IsEnabled(); } - /// Get the device id of the configured device. -- static inline int GetId() { return Get().dev; } -+ /// Get the number of available devices (may be called before configuration). -+ static int GetNumGPU(); ++ /// Get the device ID of the configured device. + static inline int GetId() { return Get().dev; } ++ /// Get the number of available devices (may be called before configuration). ++ static int GetDeviceCount(); ++ /** @brief Return true if any of the backends in the backend mask, @a b_mask, are allowed. */ /** This method can be used with any of the Backend::Id constants, the - Backend::*_MASK, or combinations of those. */ -- static inline bool Allows(unsigned long b_mask) -+ static bool Allows(unsigned long b_mask) - { return Get().backends & b_mask; } - - /** @brief Get the current Host MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory used on the host. - */ -- static inline MemoryType GetHostMemoryType() { return Get().host_mem_type; } -+ static MemoryType GetHostMemoryType() { return Get().host_mem_type; } - - /** @brief Get the current Host MemoryClass. This is the MemoryClass used - by most MFEM host Memory objects. */ -- static inline MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } -+ static MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } - - /** @brief Get the current Device MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory to be used with device kernels. - */ -- static inline MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryType(). - /** @deprecated Use GetDeviceMemoryType() instead. 
*/ -- static inline MemoryType GetMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetMemoryType() { return Get().device_mem_type; } - - /** @brief Get the current Device MemoryClass. This is the MemoryClass used - by most MFEM device kernels to access Memory objects. */ -- static inline MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryClass(). +@@ -284,9 +282,13 @@ public: /** @deprecated Use GetDeviceMemoryClass() instead. */ -- static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetMemoryClass() { return Get().device_mem_class; } + static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } + /** @brief Manually set the status of GPU-aware MPI flag for use in MPI + communication routines which have optimized implementations for device @@ -677,7 +399,7 @@ index baa27397f..a2d89e22e 100644 static bool GetGPUAwareMPI() { return Get().mpi_gpu_aware; } }; -@@ -298,7 +299,7 @@ public: +@@ -298,7 +300,7 @@ public: and ReadWrite(), while setting the device use flag in @a mem, if @a on_dev is true. 
*/ template @@ -686,7 +408,7 @@ index baa27397f..a2d89e22e 100644 { if (!on_dev) { -@@ -362,6 +363,6 @@ inline T *HostReadWrite(Memory &mem, int size) +@@ -362,6 +364,6 @@ inline T *HostReadWrite(Memory &mem, int size) return mfem::ReadWrite(mem, size, false); } diff --git a/extern/patch/mfem/patch_par_tet_mesh_fix.diff b/extern/patch/mfem/patch_par_tet_mesh_fix.diff index 58aa441f4..c4842acc2 100644 --- a/extern/patch/mfem/patch_par_tet_mesh_fix.diff +++ b/extern/patch/mfem/patch_par_tet_mesh_fix.diff @@ -1,5 +1,5 @@ diff --git a/mesh/element.hpp b/mesh/element.hpp -index 1a265a506..12467ad20 100644 +index e9dd07217..4129789bb 100644 --- a/mesh/element.hpp +++ b/mesh/element.hpp @@ -87,9 +87,6 @@ public: @@ -13,7 +13,7 @@ index 1a265a506..12467ad20 100644 virtual int NeedRefinement(HashTable &v_to_v) const { return 0; } diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index d20f689ab..88f5537cc 100644 +index ebb273f62..aa416e1b1 100644 --- a/mesh/mesh.cpp +++ b/mesh/mesh.cpp @@ -2146,7 +2146,7 @@ void Mesh::FinalizeTriMesh(int generate_edges, int refine, bool fix_orientation) @@ -384,7 +384,7 @@ index d20f689ab..88f5537cc 100644 // static method diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index fd9d0f3c6..67ab7a5ea 100644 +index 3870604aa..df53ac980 100644 --- a/mesh/mesh.hpp +++ b/mesh/mesh.hpp @@ -336,12 +336,9 @@ protected: @@ -403,7 +403,7 @@ index fd9d0f3c6..67ab7a5ea 100644 // Methods used to prepare and apply permutation of the mesh nodes assuming diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index ec612ee02..25e6bae5b 100644 +index 3f23d1877..f8d934556 100644 --- a/mesh/pmesh.cpp +++ b/mesh/pmesh.cpp @@ -20,6 +20,7 @@ @@ -580,7 +580,7 @@ index ec612ee02..25e6bae5b 100644 } diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index 22eecbd42..133a69b41 100644 +index 9653316bf..cf7f1e77a 100644 --- a/mesh/tetrahedron.cpp +++ b/mesh/tetrahedron.cpp @@ -13,6 +13,8 @@ @@ -670,7 +670,7 @@ index 22eecbd42..133a69b41 100644 + } diff --git a/mesh/tetrahedron.hpp 
b/mesh/tetrahedron.hpp -index e6093bc08..002163714 100644 +index 83665454e..821d18de4 100644 --- a/mesh/tetrahedron.hpp +++ b/mesh/tetrahedron.hpp @@ -74,7 +74,9 @@ public: @@ -685,7 +685,7 @@ index e6093bc08..002163714 100644 void ResetTransform(int tr) override { transform = tr; } unsigned GetTransform() const override { return transform; } diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp -index eb7493398..80d11f4b6 100644 +index ed05f1d37..26334fd94 100644 --- a/mesh/triangle.cpp +++ b/mesh/triangle.cpp @@ -11,6 +11,8 @@ @@ -794,7 +794,7 @@ index eb7493398..80d11f4b6 100644 + } // namespace mfem diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp -index fafd8811b..e35baf294 100644 +index 7bfdb23ef..feaa1674c 100644 --- a/mesh/triangle.hpp +++ b/mesh/triangle.hpp @@ -47,13 +47,14 @@ public: diff --git a/extern/patch/mfem/patch_pncmesh_update_fix.diff b/extern/patch/mfem/patch_pncmesh_update_fix.diff deleted file mode 100644 index 2e4810382..000000000 --- a/extern/patch/mfem/patch_pncmesh_update_fix.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/mesh/pncmesh.cpp b/mesh/pncmesh.cpp -index 60aca4583..34d59e567 100644 ---- a/mesh/pncmesh.cpp -+++ b/mesh/pncmesh.cpp -@@ -107,6 +107,8 @@ void ParNCMesh::Update() - entity_owner[i].DeleteAll(); - entity_pmat_group[i].DeleteAll(); - entity_index_rank[i].DeleteAll(); -+ entity_conf_group[i].DeleteAll(); -+ entity_elem_local[i].DeleteAll(); - } - - shared_vertices.Clear(); diff --git a/extern/patch/mfem/patch_workspace_vectors.diff b/extern/patch/mfem/patch_workspace_vectors.diff new file mode 100644 index 000000000..04522f66a --- /dev/null +++ b/extern/patch/mfem/patch_workspace_vectors.diff @@ -0,0 +1,1562 @@ +diff --git a/fem/bilinearform_ext.cpp b/fem/bilinearform_ext.cpp +index 1c26575b8..17808cda2 100644 +--- a/fem/bilinearform_ext.cpp ++++ b/fem/bilinearform_ext.cpp +@@ -13,6 +13,7 @@ + // PABilinearFormExtension and MFBilinearFormExtension. 
+ + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + #include "bilinearform.hpp" + #include "pbilinearform.hpp" + #include "pgridfunc.hpp" +@@ -68,6 +69,9 @@ void MFBilinearFormExtension::AssembleDiagonal(Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict && !DeviceCanUseCeed()) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + localY = 0.0; + for (int i = 0; i < iSz; ++i) + { +@@ -142,6 +146,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + } + else + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + elem_restrict->Mult(x, localX); + localY = 0.0; + for (int i = 0; i < iSz; ++i) +@@ -155,6 +162,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -171,6 +181,9 @@ void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) + { +@@ -190,6 +203,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + elem_restrict->Mult(x, localX); + localY = 
0.0; + for (int i = 0; i < iSz; ++i) +@@ -212,6 +228,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -228,6 +247,9 @@ void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) + { +@@ -259,10 +281,6 @@ void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + elem_restrict = trial_fes->GetElementRestriction(ordering); + if (elem_restrict) + { +- localX.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType()); +- localY.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType()); +- localY.UseDevice(true); // ensure 'localY = 0.0' is done on device +- + // Gather the attributes on the host from all the elements + const Mesh &mesh = *trial_fes->GetMesh(); + elem_attributes.SetSize(mesh.GetNE()); +@@ -279,9 +297,6 @@ void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + int_face_restrict_lex = trial_fes->GetFaceRestriction( + ElementDofOrdering::LEXICOGRAPHIC, + FaceType::Interior); +- int_face_X.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType()); +- int_face_Y.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType()); +- int_face_Y.UseDevice(true); // ensure 'int_face_Y = 0.0' is done on device + } + + const bool has_bdr_integs = (a->GetBFBFI()->Size() > 0 || +@@ -292,9 +307,6 @@ void 
PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m) + ElementDofOrdering::LEXICOGRAPHIC, + FaceType::Boundary, + m); +- bdr_face_X.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_face_Y.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_face_Y.UseDevice(true); // ensure 'faceBoundY = 0.0' is done on device + + const Mesh &mesh = *trial_fes->GetMesh(); + // See LinearFormExtension::Update for explanation of f_to_be logic. +@@ -383,6 +395,7 @@ void PABilinearFormExtension::AssembleDiagonal(Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict && !DeviceCanUseCeed()) + { ++ auto localY = Workspace::NewVector(elem_restrict->Height()); + if (iSz > 0) + { + localY = 0.0; +@@ -420,6 +433,7 @@ void PABilinearFormExtension::AssembleDiagonal(Vector &y) const + const int n_bdr_integs = bdr_integs.Size(); + if (bdr_face_restrict_lex && n_bdr_integs > 0) + { ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); + bdr_face_Y = 0.0; + for (int i = 0; i < n_bdr_integs; ++i) + { +@@ -504,6 +518,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + { + if (iSz) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + Array*> &elem_markers = *a->GetDBFI_Marker(); + elem_restrict->Mult(x, localX); + localY = 0.0; +@@ -524,6 +541,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -543,6 +563,9 @@ void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const bool has_bdr_integs = (n_bdr_face_integs > 0 || n_bdr_integs 
> 0); + if (bdr_face_restrict_lex && has_bdr_integs) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + Array*> &bdr_markers = *a->GetBBFI_Marker(); + Array*> &bdr_face_markers = *a->GetBFBFI_Marker(); + bdr_face_restrict_lex->Mult(x, bdr_face_X); +@@ -570,6 +593,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iSz = integrators.Size(); + if (elem_restrict) + { ++ auto localX = Workspace::NewVector(elem_restrict->Height()); ++ auto localY = Workspace::NewVector(elem_restrict->Height()); ++ + Array*> &elem_markers = *a->GetDBFI_Marker(); + elem_restrict->Mult(x, localX); + localY = 0.0; +@@ -594,6 +620,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) + { +@@ -613,6 +642,9 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const bool has_bdr_integs = (n_bdr_face_integs > 0 || n_bdr_integs > 0); + if (bdr_face_restrict_lex && has_bdr_integs) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + Array*> &bdr_markers = *a->GetBBFI_Marker(); + Array*> &bdr_face_markers = *a->GetBFBFI_Marker(); + +@@ -670,7 +702,7 @@ void PABilinearFormExtension::AddMultWithMarkers( + { + if (markers) + { +- tmp_evec.SetSize(y.Size()); ++ auto tmp_evec = Workspace::NewVector(y.Size()); + tmp_evec = 0.0; + if (transpose) { integ.AddMultTransposePA(x, tmp_evec); } + else { integ.AddMultPA(x, tmp_evec); } +@@ -769,6 +801,10 @@ void 
EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + { + // Apply the Element Restriction + const bool useRestrict = !DeviceCanUseCeed() && elem_restrict; ++ ++ auto localX = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ auto localY = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ + if (!useRestrict) + { + y.UseDevice(true); // typically this is a large vector, so store on device +@@ -808,6 +844,9 @@ void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + // Apply the Interior Face Restriction + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) +@@ -866,6 +905,9 @@ void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + // Apply the Boundary Face Restriction + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) +@@ -897,6 +939,10 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + { + // Apply the Element Restriction + const bool useRestrict = !DeviceCanUseCeed() && elem_restrict; ++ ++ auto localX = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ auto localY = Workspace::NewVector(useRestrict?elem_restrict->Height():0); ++ + if (!useRestrict) + { + y.UseDevice(true); // typically this is a large vector, so store on device +@@ -936,6 +982,9 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int iFISz = intFaceIntegrators.Size(); + if (int_face_restrict_lex && 
iFISz>0) + { ++ auto int_face_X = Workspace::NewVector(int_face_restrict_lex->Height()); ++ auto int_face_Y = Workspace::NewVector(int_face_restrict_lex->Height()); ++ + // Apply the Interior Face Restriction + int_face_restrict_lex->Mult(x, int_face_X); + if (int_face_X.Size()>0) +@@ -994,6 +1043,9 @@ void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const + const int bFISz = bdrFaceIntegrators.Size(); + if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0) + { ++ auto bdr_face_X = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ auto bdr_face_Y = Workspace::NewVector(bdr_face_restrict_lex->Height()); ++ + // Apply the Boundary Face Restriction + bdr_face_restrict_lex->Mult(x, bdr_face_X); + if (bdr_face_X.Size()>0) +@@ -1050,12 +1102,10 @@ void FABilinearFormExtension::Assemble() + { + pfes->ExchangeFaceNbrData(); + width += pfes->GetFaceNbrVSize(); +- dg_x.SetSize(width); + ParBilinearForm *pb = nullptr; + if ((pb = dynamic_cast(a)) && (pb->keep_nbr_block)) + { + height += pfes->GetFaceNbrVSize(); +- dg_y.SetSize(height); + keep_nbr_block = true; + } + } +@@ -1217,6 +1267,7 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const + x_gf.ExchangeFaceNbrData(); + Vector &shared_x = x_gf.FaceNbrData(); + const int local_size = a->FESpace()->GetVSize(); ++ auto dg_x = Workspace::NewVector(width); + auto dg_x_ptr = dg_x.Write(); + auto x_ptr = x.Read(); + mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i) +@@ -1232,6 +1283,7 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const + ParBilinearForm *pform = nullptr; + if ((pform = dynamic_cast(a)) && (pform->keep_nbr_block)) + { ++ auto dg_y = Workspace::NewVector(height); + mat->Mult(dg_x, dg_y); + // DG Restriction + auto dg_y_ptr = dg_y.Read(); +@@ -1278,6 +1330,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const + x_gf.ExchangeFaceNbrData(); + Vector &shared_x = x_gf.FaceNbrData(); + const int local_size 
= a->FESpace()->GetVSize(); ++ auto dg_x = Workspace::NewVector(width); + auto dg_x_ptr = dg_x.Write(); + auto x_ptr = x.Read(); + mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i) +@@ -1293,6 +1346,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const + ParBilinearForm *pb = nullptr; + if ((pb = dynamic_cast(a)) && (pb->keep_nbr_block)) + { ++ auto dg_y = Workspace::NewVector(height); + mat->MultTranspose(dg_x, dg_y); + // DG Restriction + auto dg_y_ptr = dg_y.Read(); +@@ -1392,17 +1446,6 @@ void PAMixedBilinearFormExtension::Update() + ElementDofOrdering::LEXICOGRAPHIC); + elem_restrict_test = test_fes->GetElementRestriction( + ElementDofOrdering::LEXICOGRAPHIC); +- if (elem_restrict_trial) +- { +- localTrial.UseDevice(true); +- localTrial.SetSize(elem_restrict_trial->Height(), +- Device::GetMemoryType()); +- } +- if (elem_restrict_test) +- { +- localTest.UseDevice(true); // ensure 'localY = 0.0' is done on device +- localTest.SetSize(elem_restrict_test->Height(), Device::GetMemoryType()); +- } + } + + void PAMixedBilinearFormExtension::FormRectangularSystemOperator( +@@ -1481,6 +1524,11 @@ void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y, + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_trial, x, localTrial, + elem_restrict_test, y, localTest, c); +@@ -1494,9 +1542,7 @@ void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y, + // * G^T operation + if (elem_restrict_test) + { +- tempY.SetSize(y.Size()); +- elem_restrict_test->MultTranspose(localTest, tempY); +- y += tempY; ++ elem_restrict_test->AddMultTranspose(localTest, y); + } + } + +@@ -1513,6 +1559,11 @@ void PAMixedBilinearFormExtension::AddMultTranspose(const 
Vector &x, Vector &y, + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_test, x, localTest, + elem_restrict_trial, y, localTrial, c); +@@ -1526,9 +1577,7 @@ void PAMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y, + // * G^T operation + if (elem_restrict_trial) + { +- tempY.SetSize(y.Size()); +- elem_restrict_trial->MultTranspose(localTrial, tempY); +- y += tempY; ++ elem_restrict_trial->AddMultTranspose(localTrial, y); + } + } + +@@ -1539,6 +1588,8 @@ void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D, + + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); + if (elem_restrict_trial) + { + const ElementRestriction* H1elem_restrict_trial = +@@ -1555,6 +1606,8 @@ void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D, + + if (elem_restrict_test) + { ++ auto localTest = Workspace::NewVector(elem_restrict_test->Height()); ++ + localTest = 0.0; + for (int i = 0; i < iSz; ++i) + { +@@ -1620,7 +1673,8 @@ void PADiscreteLinearOperatorExtension::Assemble() + + test_multiplicity.UseDevice(true); + test_multiplicity.SetSize(elem_restrict_test->Width()); // l-vector +- Vector ones(elem_restrict_test->Height()); // e-vector ++ ++ auto ones = Workspace::NewVector(elem_restrict_test->Height()); // e-vector + ones = 1.0; + + const ElementRestriction* elem_restrict = +@@ -1631,7 +1685,7 @@ void PADiscreteLinearOperatorExtension::Assemble() + } + else + { +- mfem_error("A real ElementRestriction is required in this setting!"); ++ MFEM_ABORT("A real ElementRestriction is required in this setting!"); + } + + auto tm = test_multiplicity.ReadWrite(); +@@ -1647,6 
+1701,11 @@ void PADiscreteLinearOperatorExtension::AddMult( + Array &integrators = *a->GetDBFI(); + const int iSz = integrators.Size(); + ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + // * G operation + SetupMultInputs(elem_restrict_trial, x, localTrial, + elem_restrict_test, y, localTest, c); +@@ -1664,13 +1723,13 @@ void PADiscreteLinearOperatorExtension::AddMult( + dynamic_cast(elem_restrict_test); + if (elem_restrict) + { +- tempY.SetSize(y.Size()); ++ auto tempY = Workspace::NewVector(y.Size()); + elem_restrict->MultLeftInverse(localTest, tempY); + y += tempY; + } + else + { +- mfem_error("In this setting you need a real ElementRestriction!"); ++ MFEM_ABORT("In this setting you need a real ElementRestriction!"); + } + } + +@@ -1691,6 +1750,12 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose( + { + xs[i] *= tm[i]; + }); ++ ++ auto localTrial = Workspace::NewVector(elem_restrict_trial ++ ?elem_restrict_trial->Height():0); ++ auto localTest = Workspace::NewVector(elem_restrict_test ++ ?elem_restrict_test->Height():0); ++ + SetupMultInputs(elem_restrict_test, xscaled, localTest, + elem_restrict_trial, y, localTrial, c); + +@@ -1703,13 +1768,11 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose( + // * G^T operation + if (elem_restrict_trial) + { +- tempY.SetSize(y.Size()); +- elem_restrict_trial->MultTranspose(localTrial, tempY); +- y += tempY; ++ elem_restrict_trial->AddMultTranspose(localTrial, y); + } + else + { +- mfem_error("Trial ElementRestriction not defined"); ++ MFEM_ABORT("Trial ElementRestriction not defined"); + } + } + +diff --git a/fem/bilinearform_ext.hpp b/fem/bilinearform_ext.hpp +index 69caf2d42..ee76fe9d5 100644 +--- a/fem/bilinearform_ext.hpp ++++ b/fem/bilinearform_ext.hpp +@@ -70,10 +70,6 @@ protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned + 
/// Attributes of all mesh elements. + Array elem_attributes, bdr_attributes; +- mutable Vector tmp_evec; // Work array +- mutable Vector localX, localY; +- mutable Vector int_face_X, int_face_Y; +- mutable Vector bdr_face_X, bdr_face_Y; + const Operator *elem_restrict; // Not owned + const FaceRestriction *int_face_restrict_lex; // Not owned + const FaceRestriction *bdr_face_restrict_lex; // Not owned +@@ -141,7 +137,6 @@ class FABilinearFormExtension : public EABilinearFormExtension + { + private: + SparseMatrix *mat; +- mutable Vector dg_x, dg_y; + + public: + FABilinearFormExtension(BilinearForm *form); +@@ -170,9 +165,6 @@ class MFBilinearFormExtension : public BilinearFormExtension + { + protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned +- mutable Vector localX, localY; +- mutable Vector int_face_X, int_face_Y; +- mutable Vector bdr_face_X, bdr_face_Y; + const Operator *elem_restrict; // Not owned + const FaceRestriction *int_face_restrict_lex; // Not owned + const FaceRestriction *bdr_face_restrict_lex; // Not owned +@@ -240,7 +232,6 @@ class PAMixedBilinearFormExtension : public MixedBilinearFormExtension + { + protected: + const FiniteElementSpace *trial_fes, *test_fes; // Not owned +- mutable Vector localTrial, localTest, tempY; + const Operator *elem_restrict_trial; // Not owned + const Operator *elem_restrict_test; // Not owned + +diff --git a/fem/linearform_ext.cpp b/fem/linearform_ext.cpp +index 355431eb7..f3d5adba2 100644 +--- a/fem/linearform_ext.cpp ++++ b/fem/linearform_ext.cpp +@@ -11,6 +11,7 @@ + + #include "linearform.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + namespace mfem + { +@@ -61,6 +62,7 @@ void LinearFormExtension::Assemble() + } + + // Assemble the linear form ++ auto b = Workspace::NewVector(elem_restrict_lex->Height()); + b = 0.0; + domain_integs[k]->AssembleDevice(fes, markers, b); + if (k == 0) { elem_restrict_lex->MultTranspose(b, *lf); } +@@ -104,6 +106,7 @@ void 
LinearFormExtension::Assemble() + } + + // Assemble the linear form ++ auto bdr_b = Workspace::NewVector(bdr_restrict_lex->Height()); + bdr_b = 0.0; + boundary_integs[k]->AssembleDevice(fes, bdr_markers, bdr_b); + bdr_restrict_lex->AddMultTranspose(bdr_b, *lf); +@@ -130,8 +133,6 @@ void LinearFormExtension::Update() + + elem_restrict_lex = fes.GetElementRestriction(ordering); + MFEM_VERIFY(elem_restrict_lex, "Element restriction not available"); +- b.SetSize(elem_restrict_lex->Height(), Device::GetMemoryType()); +- b.UseDevice(true); + } + + if (lf->boundary_integs.Size() > 0) +@@ -168,9 +169,6 @@ void LinearFormExtension::Update() + dynamic_cast( + fes.GetFaceRestriction(ordering, FaceType::Boundary, + L2FaceValues::SingleValued)); +- MFEM_VERIFY(bdr_restrict_lex, "Face restriction not available"); +- bdr_b.SetSize(bdr_restrict_lex->Height(), Device::GetMemoryType()); +- bdr_b.UseDevice(true); + } + } + +diff --git a/fem/linearform_ext.hpp b/fem/linearform_ext.hpp +index 0103185e0..eb54d8745 100644 +--- a/fem/linearform_ext.hpp ++++ b/fem/linearform_ext.hpp +@@ -39,9 +39,6 @@ class LinearFormExtension + /// Operator that converts L-vectors to boundary E-vectors. + const FaceRestriction *bdr_restrict_lex; // Not owned + +- /// Internal E-vectors. +- mutable Vector b, bdr_b; +- + public: + + /// \brief Create a LinearForm extension of @a lf. 
+diff --git a/fem/qinterp/det.cpp b/fem/qinterp/det.cpp +index e0b034b82..dda386644 100644 +--- a/fem/qinterp/det.cpp ++++ b/fem/qinterp/det.cpp +@@ -11,6 +11,7 @@ + + #include "../quadinterpolator.hpp" + #include "../../general/forall.hpp" ++#include "../../general/workspace.hpp" + #include "../../linalg/dtensor.hpp" + #include "../../fem/kernels.hpp" + #include "../../linalg/kernels.hpp" +@@ -204,8 +205,7 @@ static void Det3D(const int NE, + const double *x, + double *y, + const int d1d = 0, +- const int q1d = 0, +- Vector *d_buff = nullptr) // used only with SMEM = false ++ const int q1d = 0) + { + constexpr int DIM = 3; + static constexpr int GRID = SMEM ? 0 : 128; +@@ -219,6 +219,7 @@ static void Det3D(const int NE, + auto Y = Reshape(y, Q1D, Q1D, Q1D, NE); + + double *GM = nullptr; ++ int buffer_size = 0; + if (!SMEM) + { + const DeviceDofQuadLimits &limits = DeviceDofQuadLimits::Get(); +@@ -226,9 +227,11 @@ static void Det3D(const int NE, + const int max_d1d = T_D1D ? T_D1D : limits.MAX_Q1D; + const int max_qd = std::max(max_q1d, max_d1d); + const int mem_size = max_qd * max_qd * max_qd * 9; +- d_buff->SetSize(2*mem_size*GRID); +- GM = d_buff->Write(); ++ buffer_size = 2*mem_size*GRID; + } ++ // if SMEM is true, d_buff will be empty (zero size) ++ auto d_buff = Workspace::NewVector(buffer_size); ++ GM = d_buff.Write(); + + mfem::forall_3D_grid(NE, Q1D, Q1D, Q1D, GRID, [=] MFEM_HOST_DEVICE (int e) + { +@@ -278,8 +281,7 @@ void TensorDeterminants(const int NE, + const int vdim, + const DofToQuad &maps, + const Vector &e_vec, +- Vector &q_det, +- Vector &d_buff) ++ Vector &q_det) + { + if (NE == 0) { return; } + const int dim = maps.FE->GetDim(); +@@ -347,8 +349,7 @@ void TensorDeterminants(const int NE, + if (D1D <= MD && Q1D <= MQ) + { return Det3D<0,0,true>(NE,B,G,X,Y,D1D,Q1D); } + // Last fall-back will use global memory +- return Det3D<0,0,false>( +- NE,B,G,X,Y,D1D,Q1D,&d_buff); ++ return Det3D<0,0,false>(NE,B,G,X,Y,D1D,Q1D); + } + } + } +diff --git 
a/fem/qinterp/dispatch.hpp b/fem/qinterp/dispatch.hpp +index 7e7c0ebfa..9faf53b81 100644 +--- a/fem/qinterp/dispatch.hpp ++++ b/fem/qinterp/dispatch.hpp +@@ -54,8 +54,7 @@ void TensorDeterminants(const int NE, + const int vdim, + const DofToQuad &maps, + const Vector &e_vec, +- Vector &q_det, +- Vector &d_buff); ++ Vector &q_det); + + } // namespace quadrature_interpolator + +diff --git a/fem/quadinterpolator.cpp b/fem/quadinterpolator.cpp +index 8314fdcfb..67e77dd9a 100644 +--- a/fem/quadinterpolator.cpp ++++ b/fem/quadinterpolator.cpp +@@ -28,7 +28,6 @@ QuadratureInterpolator::QuadratureInterpolator(const FiniteElementSpace &fes, + q_layout(QVectorLayout::byNODES), + use_tensor_products(UsesTensorBasis(fes)) + { +- d_buffer.UseDevice(true); + if (fespace->GetNE() == 0) { return; } + const FiniteElement *fe = fespace->GetFE(0); + MFEM_VERIFY(dynamic_cast(fe) != NULL, +@@ -44,7 +43,6 @@ QuadratureInterpolator::QuadratureInterpolator(const FiniteElementSpace &fes, + q_layout(QVectorLayout::byNODES), + use_tensor_products(UsesTensorBasis(fes)) + { +- d_buffer.UseDevice(true); + if (fespace->GetNE() == 0) { return; } + const FiniteElement *fe = fespace->GetFE(0); + MFEM_VERIFY(dynamic_cast(fe) != NULL, +@@ -530,7 +528,7 @@ void QuadratureInterpolator::Mult(const Vector &e_vec, + } + if (eval_flags & DETERMINANTS) + { +- TensorDeterminants(ne, vdim, maps, e_vec, q_det, d_buffer); ++ TensorDeterminants(ne, vdim, maps, e_vec, q_det); + } + } + else // use_tensor_eval == false +diff --git a/fem/quadinterpolator.hpp b/fem/quadinterpolator.hpp +index 6672ee2f1..95fa8c9a3 100644 +--- a/fem/quadinterpolator.hpp ++++ b/fem/quadinterpolator.hpp +@@ -37,7 +37,6 @@ protected: + mutable QVectorLayout q_layout; ///< Output Q-vector layout + + mutable bool use_tensor_products; ///< Tensor product evaluation mode +- mutable Vector d_buffer; ///< Auxiliary device buffer + + public: + static const int MAX_NQ2D = 100; +diff --git a/fem/transfer.cpp b/fem/transfer.cpp +index 
a1eee41db..1db97d041 100644 +--- a/fem/transfer.cpp ++++ b/fem/transfer.cpp +@@ -12,6 +12,7 @@ + #include "transfer.hpp" + #include "bilinearform.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + namespace mfem + { +@@ -1399,15 +1400,10 @@ TensorProductPRefinementTransferOperator( + MFEM_VERIFY(elem_restrict_lex_h, + "High order ElementRestriction not available"); + +- localL.SetSize(elem_restrict_lex_l->Height(), Device::GetMemoryType()); +- localH.SetSize(elem_restrict_lex_h->Height(), Device::GetMemoryType()); +- localL.UseDevice(true); +- localH.UseDevice(true); +- + MFEM_VERIFY(dynamic_cast(elem_restrict_lex_h), + "High order element restriction is of unsupported type"); + +- mask.SetSize(localH.Size(), Device::GetMemoryType()); ++ mask.SetSize(elem_restrict_lex_h->Height(), Device::GetMemoryType()); + static_cast(elem_restrict_lex_h) + ->BooleanMask(mask); + mask.UseDevice(true); +@@ -1680,6 +1676,9 @@ void TensorProductPRefinementTransferOperator::Mult(const Vector& x, + return; + } + ++ auto localH = Workspace::NewVector(elem_restrict_lex_h->Height()); ++ auto localL = Workspace::NewVector(elem_restrict_lex_l->Height()); ++ + elem_restrict_lex_l->Mult(x, localL); + if (dim == 2) + { +@@ -1706,6 +1705,9 @@ void TensorProductPRefinementTransferOperator::MultTranspose(const Vector& x, + return; + } + ++ auto localH = Workspace::NewVector(elem_restrict_lex_h->Height()); ++ auto localL = Workspace::NewVector(elem_restrict_lex_l->Height()); ++ + elem_restrict_lex_h->Mult(x, localH); + if (dim == 2) + { +@@ -1731,7 +1733,7 @@ TrueTransferOperator::TrueTransferOperator(const FiniteElementSpace& lFESpace_, + lFESpace(lFESpace_), + hFESpace(hFESpace_) + { +- localTransferOperator = new TransferOperator(lFESpace_, hFESpace_); ++ localTransferOperator.reset(new TransferOperator(lFESpace_, hFESpace_)); + + P = lFESpace.GetProlongationMatrix(); + R = hFESpace.IsVariableOrder() ? 
hFESpace.GetHpRestrictionMatrix() : +@@ -1741,34 +1743,23 @@ TrueTransferOperator::TrueTransferOperator(const FiniteElementSpace& lFESpace_, + // P can be null and R not null + // If P is not null it is assumed that R is not null as well + if (P) { MFEM_VERIFY(R, "Both P and R have to be not NULL") } +- +- if (P) +- { +- tmpL.SetSize(lFESpace_.GetVSize()); +- tmpH.SetSize(hFESpace_.GetVSize()); +- } +- // P can be null and R not null +- else if (R) +- { +- tmpH.SetSize(hFESpace_.GetVSize()); +- } +-} +- +-TrueTransferOperator::~TrueTransferOperator() +-{ +- delete localTransferOperator; + } + + void TrueTransferOperator::Mult(const Vector& x, Vector& y) const + { + if (P) + { ++ auto tmpL = Workspace::NewVector(lFESpace.GetVSize()); ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + P->Mult(x, tmpL); + localTransferOperator->Mult(tmpL, tmpH); + R->Mult(tmpH, y); + } + else if (R) + { ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + localTransferOperator->Mult(x, tmpH); + R->Mult(tmpH, y); + } +@@ -1782,12 +1773,17 @@ void TrueTransferOperator::MultTranspose(const Vector& x, Vector& y) const + { + if (P) + { ++ auto tmpL = Workspace::NewVector(lFESpace.GetVSize()); ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + R->MultTranspose(x, tmpH); + localTransferOperator->MultTranspose(tmpH, tmpL); + P->MultTranspose(tmpL, y); + } + else if (R) + { ++ auto tmpH = Workspace::NewVector(hFESpace.GetVSize()); ++ + R->MultTranspose(x, tmpH); + localTransferOperator->MultTranspose(tmpH, y); + } +diff --git a/fem/transfer.hpp b/fem/transfer.hpp +index 10e3a3de3..58cd6583e 100644 +--- a/fem/transfer.hpp ++++ b/fem/transfer.hpp +@@ -470,8 +470,6 @@ private: + const Operator* elem_restrict_lex_l; + const Operator* elem_restrict_lex_h; + Vector mask; +- mutable Vector localL; +- mutable Vector localH; + + public: + /// @brief Constructs a transfer operator from \p lFESpace to \p hFESpace +@@ -505,9 +503,7 @@ private: + const FiniteElementSpace& 
hFESpace; + const Operator * P = nullptr; + const SparseMatrix * R = nullptr; +- TransferOperator* localTransferOperator; +- mutable Vector tmpL; +- mutable Vector tmpH; ++ std::unique_ptr localTransferOperator; + + public: + /// @brief Constructs a transfer operator working on true degrees of freedom +@@ -515,9 +511,6 @@ public: + TrueTransferOperator(const FiniteElementSpace& lFESpace_, + const FiniteElementSpace& hFESpace_); + +- /// Destructor +- ~TrueTransferOperator(); +- + /// @brief Interpolation or prolongation of a true dof vector \p x to a true + /// dof vector \p y. + /** The true dof vector \p x corresponding to the coarse space is restricted +diff --git a/general/CMakeLists.txt b/general/CMakeLists.txt +index 67c3653c4..6f610f959 100644 +--- a/general/CMakeLists.txt ++++ b/general/CMakeLists.txt +@@ -31,6 +31,7 @@ list(APPEND SRCS + tinyxml2.cpp + version.cpp + hip.cpp ++ workspace.cpp + ) + + list(APPEND HDRS +@@ -64,6 +65,7 @@ list(APPEND HDRS + text.hpp + version.hpp + hip.hpp ++ workspace.hpp + ) + + if (MFEM_USE_MPI) +diff --git a/general/workspace.cpp b/general/workspace.cpp +new file mode 100644 +index 000000000..ad8210a61 +--- /dev/null ++++ b/general/workspace.cpp +@@ -0,0 +1,144 @@ ++// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC. Produced ++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files ++// LICENSE and NOTICE for details. LLNL-CODE-806117. ++// ++// This file is part of the MFEM library. For more information and source code ++// availability visit https://mfem.org. ++// ++// MFEM is free software; you can redistribute it and/or modify it under the ++// terms of the BSD-3 license. We welcome feedback and contributions, see file ++// CONTRIBUTING.md for details. 
++ ++#include "workspace.hpp" ++ ++namespace mfem ++{ ++ ++WorkspaceVector::WorkspaceVector( ++ internal::WorkspaceChunk &chunk_, int offset_, int n) ++ : Vector(chunk_.GetData(), chunk_.GetOffset(), n), ++ chunk(chunk_), ++ offset(offset_), ++ original_size(n) ++{ ++ UseDevice(true); ++} ++ ++WorkspaceVector::WorkspaceVector(WorkspaceVector &&other) ++ : Vector(std::move(other)), ++ chunk(other.chunk), ++ offset(other.offset), ++ original_size(other.original_size) ++{ ++ if (this != &other) { other.moved_from = true; } ++} ++ ++WorkspaceVector::~WorkspaceVector() ++{ ++ if (!moved_from) { chunk.FreeVector(*this); } ++} ++ ++namespace internal ++{ ++ ++WorkspaceChunk::WorkspaceChunk(int capacity) ++ : data(capacity), original_capacity(capacity) ++{ } ++ ++WorkspaceVector WorkspaceChunk::NewVector(int n) ++{ ++ MFEM_ASSERT(HasCapacityFor(n), "Requested vector is too large."); ++ WorkspaceVector vector(*this, offset, n); ++ offset += n; ++ vector_count += 1; ++ return vector; ++} ++ ++void WorkspaceChunk::FreeVector(const WorkspaceVector &v) ++{ ++ MFEM_ASSERT(vector_count >= 0, ""); ++ vector_count -= 1; ++ // If the chunk is completely empty, we can reclaim all of the memory and ++ // allow new allocations (before it is completely empty, we cannot reclaim ++ // memory because we don't track the specific regions that are freed). ++ if (vector_count == 0) ++ { ++ offset = 0; ++ // If we are not the front chunk, deallocate the backing memory. This ++ // chunk will be consolidated later anyway. ++ if (!front) { data.Destroy(); } ++ } ++ else ++ { ++ // If the vector being freed is the most recent vector allocated (i.e. if ++ // the vector is freed in stack/LIFO order), then we can reclaim its ++ // memory by moving the offset. 
++ const int size = v.original_size; ++ if (v.offset == offset - size) ++ { ++ offset -= size; ++ } ++ } ++} ++ ++} // namespace internal ++ ++Workspace &Workspace::Instance() ++{ ++ static Workspace ws; ++ return ws; ++} ++ ++void Workspace::ConsolidateAndEnsureAvailable(int requested_size) ++{ ++ int n_empty = 0; ++ int empty_capacity = 0; ++ // Merge all empty chunks at the beginning of the list ++ auto it = chunks.begin(); ++ while (it != chunks.end() && it->IsEmpty()) ++ { ++ empty_capacity += it->GetOriginalCapacity(); ++ ++it; ++ ++n_empty; ++ } ++ ++ // If we have multiple empty chunks at the beginning of the list, we need ++ // to merge them. Also, if the front chunk is empty, but not big enough, ++ // we need to replace it, so we remove it here. ++ if (n_empty > 1 || requested_size > empty_capacity) ++ { ++ // Note: if chunks is empty, then 'it' will be chunks.begin(), and the ++ // next line is a no-op. ++ chunks.erase_after(chunks.before_begin(), it); ++ } ++ ++ const int capacity = chunks.empty() ? -1 : ++ chunks.front().GetAvailableCapacity(); ++ ++ const int min_chunk_size = std::max(requested_size, empty_capacity); ++ ++ if (min_chunk_size > capacity) ++ { ++ if (!chunks.empty()) { chunks.front().SetFront(false); } ++ chunks.emplace_front(min_chunk_size); ++ } ++} ++ ++WorkspaceVector Workspace::NewVector(int n) ++{ ++ Workspace &ws = Instance(); ++ ws.ConsolidateAndEnsureAvailable(n); ++ return ws.chunks.front().NewVector(n); ++} ++ ++void Workspace::Reserve(int n) ++{ ++ Instance().ConsolidateAndEnsureAvailable(n); ++} ++ ++void Workspace::Clear() ++{ ++ Instance().chunks.clear(); ++} ++ ++} // namespace mfem +diff --git a/general/workspace.hpp b/general/workspace.hpp +new file mode 100644 +index 000000000..b5c0ebae8 +--- /dev/null ++++ b/general/workspace.hpp +@@ -0,0 +1,176 @@ ++// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC. Produced ++// at the Lawrence Livermore National Laboratory. All Rights reserved. 
See files ++// LICENSE and NOTICE for details. LLNL-CODE-806117. ++// ++// This file is part of the MFEM library. For more information and source code ++// availability visit https://mfem.org. ++// ++// MFEM is free software; you can redistribute it and/or modify it under the ++// terms of the BSD-3 license. We welcome feedback and contributions, see file ++// CONTRIBUTING.md for details. ++ ++#ifndef MFEM_WORKSPACE_HPP ++#define MFEM_WORKSPACE_HPP ++ ++#include "../linalg/vector.hpp" ++#include ++ ++namespace mfem ++{ ++ ++// Forward declaration of internal::WorkspaceChunk ++namespace internal { class WorkspaceChunk; } ++ ++/// @brief A vector used as a short-lived workspace for temporary calculations, ++/// created with Workspace::NewVector. ++/// ++/// A WorkspaceVector is created using the Workspace bump allocator. The ++/// allocator can quickly and cheaply create new vectors, so that these vectors ++/// can be created and destroyed in loops without incurring the memory ++/// allocation and deallocation overhead. ++/// ++/// WorkspaceVector%s should be used only for short-lived temporary storage; for ++/// example, they are not intended to be stored as member data in other classes. ++class WorkspaceVector : public Vector ++{ ++ // using internal::WorkspaceChunk; ++ friend class internal::WorkspaceChunk; ++ ++ /// The WorkspaceChunk containing the data for this vector. ++ internal::WorkspaceChunk &chunk; ++ ++ /// Offset in the chunk. ++ const int offset; ++ ++ /// Original size allocated. ++ const int original_size; ++ ++ /// @brief Has this WorkspaceVector been moved from? If so, don't deallocate ++ /// from its WorkspaceChunk in the destructor. ++ bool moved_from = false; ++ ++ /// Private constructor, create with Workspace::NewVector() instead. ++ WorkspaceVector(internal::WorkspaceChunk &chunk_, int offset_, int n); ++ ++public: ++ /// @brief Move constructor. The moved-from WorkspaceVector has @a ++ /// size_in_chunk set to zero. 
++ WorkspaceVector(WorkspaceVector &&other); ++ ++ /// No copy constructor. ++ WorkspaceVector(WorkspaceVector &other) = delete; ++ ++ /// Copy assignment: copy contents of vector, not metadata. ++ WorkspaceVector& operator=(WorkspaceVector &other) ++ { ++ Vector::operator=(other); ++ return *this; ++ } ++ ++ /// Cannot move to an existing WorkspaceVector. ++ WorkspaceVector& operator=(WorkspaceVector &&other) = delete; ++ ++ // All other operator=, inherit from Vector ++ using Vector::operator=; ++ ++ /// Destructor. Notifies the WorkspaceChunk that this vector has been freed. ++ ~WorkspaceVector(); ++}; ++ ++namespace internal ++{ ++ ++/// @brief A chunk of storage used to allocate WorkspaceVector%s. ++/// ++/// This is an internal class used by Workspace. ++class WorkspaceChunk ++{ ++ /// The data used as a base for the WorkspaceVector%s. ++ Vector data; ++ ++ /// The offset of currently allocated WorkspaceVector%s in this chunk. ++ int offset = 0; ++ ++ /// How many vectors have been allocated in this chunk. ++ int vector_count = 0; ++ ++ /// Is the chunk in the front of the list? ++ bool front = true; ++ ++ /// The original capacity allocated. ++ const int original_capacity; ++ ++public: ++ /// Create a WorkspaceChunk with the given @a capacity. ++ WorkspaceChunk(int capacity); ++ ++ /// @brief Return the available capacity (i.e. the largest vector that will ++ /// fit in this chunk). ++ int GetAvailableCapacity() const { return data.Size() - offset; } ++ ++ /// @brief Returns the original capacity of the chunk. ++ /// ++ /// If the chunk is not in the front of the list and all of its vectors are ++ /// freed, it may deallocate its data, so the capacity becomes zero. The ++ /// "original capacity" remains unchanged. ++ int GetOriginalCapacity() const { return original_capacity; } ++ ++ /// Return the data offset. 
++ int GetOffset() const { return offset; } ++ ++ /// Sets whether the chunk is in the front of the list ++ void SetFront(bool front_) { front = front_; } ++ ++ /// Returns true if this chunk can fit a new vector of size @a n. ++ bool HasCapacityFor(int n) const { return n <= GetAvailableCapacity(); } ++ ++ /// Returns true if this chunk is empty. ++ bool IsEmpty() const { return vector_count == 0; } ++ ++ /// Note that a vector from this chunk has been deallocated. ++ void FreeVector(const WorkspaceVector &v); ++ ++ /// Returns the backing data Vector. ++ Vector &GetData() { return data; } ++ ++ /// Returns a new WorkspaceVector of size @a n. ++ WorkspaceVector NewVector(int n); ++}; ++ ++} ++ ++/// @brief Storage for temporary, short-lived workspace vectors. ++/// ++/// This class implements a simple bump allocator to quickly allocate and ++/// deallocate workspace vectors without incurring the overhead of memory ++/// allocation and deallocation. ++class Workspace ++{ ++ /// Chunks of storage to hold the vectors. ++ std::forward_list chunks; ++ ++ /// Default constructor, private (singleton class). ++ Workspace() = default; ++ ++ /// @brief Consolidate the chunks (merge consecutive empty chunks), and ++ /// ensure that the front chunk has sufficient available capacity for a ++ /// vector of @a requested_size. ++ void ConsolidateAndEnsureAvailable(int requested_size); ++ ++ /// Return the singleton instance. ++ static Workspace &Instance(); ++ ++public: ++ /// Return a new WorkspaceVector of the requested size. ++ static WorkspaceVector NewVector(int n); ++ ++ /// Ensure that capacity of at least @a n is available for allocations. ++ static void Reserve(int n); ++ ++ /// Clear all storage. Invalidates any existing WorkspaceVector%s. 
++ static void Clear(); ++}; ++ ++} // namespace mfem ++ ++#endif +diff --git a/linalg/blockoperator.cpp b/linalg/blockoperator.cpp +index f5e1d02c7..c6b48d8b9 100644 +--- a/linalg/blockoperator.cpp ++++ b/linalg/blockoperator.cpp +@@ -11,6 +11,7 @@ + + + #include "../general/array.hpp" ++#include "../general/workspace.hpp" + #include "operator.hpp" + #include "blockvector.hpp" + #include "blockoperator.hpp" +@@ -78,7 +79,7 @@ void BlockOperator::Mult (const Vector & x, Vector & y) const + + for (int iRow=0; iRow < nRowBlocks; ++iRow) + { +- tmp.SetSize(row_offsets[iRow+1] - row_offsets[iRow]); ++ auto tmp = Workspace::NewVector(row_offsets[iRow+1] - row_offsets[iRow]); + for (int jCol=0; jCol < nColBlocks; ++jCol) + { + if (op(iRow,jCol)) +@@ -109,7 +110,7 @@ void BlockOperator::MultTranspose (const Vector & x, Vector & y) const + + for (int iRow=0; iRow < nColBlocks; ++iRow) + { +- tmp.SetSize(col_offsets[iRow+1] - col_offsets[iRow]); ++ auto tmp = Workspace::NewVector(col_offsets[iRow+1] - col_offsets[iRow]); + for (int jCol=0; jCol < nRowBlocks; ++jCol) + { + if (op(jCol,iRow)) +@@ -284,8 +285,8 @@ void BlockLowerTriangularPreconditioner::Mult (const Vector & x, + y = 0.0; + for (int iRow=0; iRow < nBlocks; ++iRow) + { +- tmp.SetSize(offsets[iRow+1] - offsets[iRow]); +- tmp2.SetSize(offsets[iRow+1] - offsets[iRow]); ++ auto tmp = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); ++ auto tmp2 = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); + tmp2 = 0.0; + tmp2 += xblock.GetBlock(iRow); + for (int jCol=0; jCol < iRow; ++jCol) +@@ -320,8 +321,8 @@ void BlockLowerTriangularPreconditioner::MultTranspose (const Vector & x, + y = 0.0; + for (int iRow=nBlocks-1; iRow >=0; --iRow) + { +- tmp.SetSize(offsets[iRow+1] - offsets[iRow]); +- tmp2.SetSize(offsets[iRow+1] - offsets[iRow]); ++ auto tmp = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); ++ auto tmp2 = Workspace::NewVector(offsets[iRow+1] - offsets[iRow]); + tmp2 = 0.0; + tmp2 += 
xblock.GetBlock(iRow); + for (int jCol=iRow+1; jCol < nBlocks; ++jCol) +diff --git a/linalg/blockoperator.hpp b/linalg/blockoperator.hpp +index a6c05c477..8900a380b 100644 +--- a/linalg/blockoperator.hpp ++++ b/linalg/blockoperator.hpp +@@ -127,7 +127,6 @@ private: + //! Temporary Vectors used to efficiently apply the Mult and MultTranspose methods. + mutable BlockVector xblock; + mutable BlockVector yblock; +- mutable Vector tmp; + }; + + //! @class BlockDiagonalPreconditioner +@@ -281,8 +280,6 @@ private: + //! methods. + mutable BlockVector xblock; + mutable BlockVector yblock; +- mutable Vector tmp; +- mutable Vector tmp2; + }; + + } +diff --git a/linalg/operator.cpp b/linalg/operator.cpp +index 3fd4d64f9..9b7f8ed54 100644 +--- a/linalg/operator.cpp ++++ b/linalg/operator.cpp +@@ -12,6 +12,7 @@ + #include "vector.hpp" + #include "operator.hpp" + #include "../general/forall.hpp" ++#include "../general/workspace.hpp" + + #include + #include +@@ -50,7 +51,7 @@ void Operator::InitTVectors(const Operator *Po, const Operator *Ri, + + void Operator::AddMult(const Vector &x, Vector &y, const double a) const + { +- mfem::Vector z(y.Size()); ++ auto z = Workspace::NewVector(y.Size()); + Mult(x, z); + y.Add(a, z); + } +@@ -58,7 +59,7 @@ void Operator::AddMult(const Vector &x, Vector &y, const double a) const + void Operator::AddMultTranspose(const Vector &x, Vector &y, + const double a) const + { +- mfem::Vector z(y.Size()); ++ auto z = Workspace::NewVector(y.Size()); + MultTranspose(x, z); + y.Add(a, z); + } +@@ -516,13 +517,8 @@ ConstrainedOperator::ConstrainedOperator(Operator *A, const Array &list, + { + // 'mem_class' should work with A->Mult() and mfem::forall(): + mem_class = A->GetMemoryClass()*Device::GetDeviceMemoryClass(); +- MemoryType mem_type = GetMemoryType(mem_class); + list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + constraint_list.MakeRef(list); +- // typically z and w are large vectors, so use the device (GPU) to perform +- 
// operations on them +- z.SetSize(height, mem_type); z.UseDevice(true); +- w.SetSize(height, mem_type); w.UseDevice(true); + } + + void ConstrainedOperator::AssembleDiagonal(Vector &diag) const +@@ -558,6 +554,7 @@ void ConstrainedOperator::AssembleDiagonal(Vector &diag) const + + void ConstrainedOperator::EliminateRHS(const Vector &x, Vector &b) const + { ++ auto w = Workspace::NewVector(height); + w = 0.0; + const int csz = constraint_list.Size(); + auto idx = constraint_list.Read(); +@@ -570,9 +567,7 @@ void ConstrainedOperator::EliminateRHS(const Vector &x, Vector &b) const + d_w[id] = d_x[id]; + }); + +- // A.AddMult(w, b, -1.0); // if available to all Operators +- A->Mult(w, z); +- b -= z; ++ A->AddMult(w, b, -1.0); + + // Use read+write access - we are modifying sub-vector of b + auto d_b = b.ReadWrite(); +@@ -600,6 +595,7 @@ void ConstrainedOperator::ConstrainedMult(const Vector &x, Vector &y, + return; + } + ++ auto z = Workspace::NewVector(height); + z = x; + + auto idx = constraint_list.Read(); +@@ -657,13 +653,6 @@ void ConstrainedOperator::MultTranspose(const Vector &x, Vector &y) const + ConstrainedMult(x, y, transpose); + } + +-void ConstrainedOperator::AddMult(const Vector &x, Vector &y, +- const double a) const +-{ +- Mult(x, w); +- y.Add(a, w); +-} +- + RectangularConstrainedOperator::RectangularConstrainedOperator( + Operator *A, + const Array &trial_list, +@@ -673,19 +662,16 @@ RectangularConstrainedOperator::RectangularConstrainedOperator( + { + // 'mem_class' should work with A->Mult() and mfem::forall(): + mem_class = A->GetMemoryClass()*Device::GetMemoryClass(); +- MemoryType mem_type = GetMemoryType(mem_class); + trial_list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + test_list.Read(); // TODO: just ensure 'list' is registered, no need to copy it + trial_constraints.MakeRef(trial_list); + test_constraints.MakeRef(test_list); +- // typically z and w are large vectors, so store them on the device +- 
z.SetSize(height, mem_type); z.UseDevice(true); +- w.SetSize(width, mem_type); w.UseDevice(true); + } + + void RectangularConstrainedOperator::EliminateRHS(const Vector &x, + Vector &b) const + { ++ auto w = Workspace::NewVector(width); + w = 0.0; + const int trial_csz = trial_constraints.Size(); + auto trial_idx = trial_constraints.Read(); +@@ -719,6 +705,7 @@ void RectangularConstrainedOperator::Mult(const Vector &x, Vector &y) const + } + else + { ++ auto w = Workspace::NewVector(width); + w = x; + + auto idx = trial_constraints.Read(); +@@ -754,6 +741,7 @@ void RectangularConstrainedOperator::MultTranspose(const Vector &x, + } + else + { ++ auto z = Workspace::NewVector(height); + z = x; + + auto idx = test_constraints.Read(); +diff --git a/linalg/operator.hpp b/linalg/operator.hpp +index fa856ffb8..b8b1761ae 100644 +--- a/linalg/operator.hpp ++++ b/linalg/operator.hpp +@@ -897,7 +897,6 @@ protected: + Array constraint_list; ///< List of constrained indices/dofs. + Operator *A; ///< The unconstrained Operator. + bool own_A; ///< Ownership flag for A. +- mutable Vector z, w; ///< Auxiliary vectors. + MemoryClass mem_class; + DiagonalPolicy diag_policy; ///< Diagonal policy for constrained dofs + +@@ -946,8 +945,6 @@ public: + the vectors, and "_i" -- the rest of the entries. */ + void Mult(const Vector &x, Vector &y) const override; + +- void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; +- + void MultTranspose(const Vector &x, Vector &y) const override; + + /** @brief Implementation of Mult or MultTranspose. 
+@@ -972,7 +969,6 @@ protected: + Array trial_constraints, test_constraints; + Operator *A; + bool own_A; +- mutable Vector z, w; + MemoryClass mem_class; + + public: +diff --git a/linalg/solvers.cpp b/linalg/solvers.cpp +index ba0af7e91..053607e88 100644 +--- a/linalg/solvers.cpp ++++ b/linalg/solvers.cpp +@@ -13,6 +13,7 @@ + #include "../general/annotation.hpp" + #include "../general/forall.hpp" + #include "../general/globals.hpp" ++#include "../general/workspace.hpp" + #include "../fem/bilinearform.hpp" + #include + #include +@@ -212,10 +213,9 @@ OperatorJacobiSmoother::OperatorJacobiSmoother(const BilinearForm &a, + dinv(height), + damping(dmpng), + ess_tdof_list(&ess_tdofs), +- residual(height), + allow_updates(false) + { +- Vector &diag(residual); ++ auto diag = Workspace::NewVector(height); + a.AssembleDiagonal(diag); + // 'a' cannot be used for iterative_mode == true because its size may be + // different. +@@ -231,7 +231,6 @@ OperatorJacobiSmoother::OperatorJacobiSmoother(const Vector &d, + dinv(height), + damping(dmpng), + ess_tdof_list(&ess_tdofs), +- residual(height), + oper(NULL), + allow_updates(false) + { +@@ -268,15 +267,13 @@ void OperatorJacobiSmoother::SetOperator(const Operator &op) + ess_tdof_list = nullptr; + } + dinv.SetSize(height); +- residual.SetSize(height); +- Vector &diag(residual); ++ auto diag = Workspace::NewVector(height); + op.AssembleDiagonal(diag); + Setup(diag); + } + + void OperatorJacobiSmoother::Setup(const Vector &diag) + { +- residual.UseDevice(true); + const double delta = damping; + auto D = diag.Read(); + auto DI = dinv.Write(); +@@ -307,6 +304,8 @@ void OperatorJacobiSmoother::Mult(const Vector &x, Vector &y) const + MFEM_ASSERT(x.Size() == Width(), "invalid input vector"); + MFEM_ASSERT(y.Size() == Height(), "invalid output vector"); + ++ auto residual = Workspace::NewVector(height); ++ + if (iterative_mode) + { + MFEM_VERIFY(oper, "iterative_mode == true requires the forward operator"); +@@ -341,7 +340,6 @@ 
OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator &oper_, + diag(d), + coeffs(order), + ess_tdof_list(ess_tdofs), +- residual(N), + oper(&oper_) { Setup(); } + + #ifdef MFEM_USE_MPI +@@ -362,7 +360,6 @@ OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator &oper_, + diag(d), + coeffs(order), + ess_tdof_list(ess_tdofs), +- residual(N), + oper(&oper_) + { + OperatorJacobiSmoother invDiagOperator(diag, ess_tdofs, 1.0); +@@ -405,7 +402,6 @@ OperatorChebyshevSmoother::OperatorChebyshevSmoother(const Operator* oper_, + void OperatorChebyshevSmoother::Setup() + { + // Invert diagonal +- residual.UseDevice(true); + auto D = diag.Read(); + auto X = dinv.Write(); + mfem::forall(N, [=] MFEM_HOST_DEVICE (int i) { X[i] = 1.0 / D[i]; }); +@@ -494,8 +490,10 @@ void OperatorChebyshevSmoother::Mult(const Vector& x, Vector &y) const + MFEM_ABORT("Chebyshev smoother requires operator"); + } + ++ auto residual = Workspace::NewVector(x.Size()); ++ auto helperVector = Workspace::NewVector(x.Size()); ++ + residual = x; +- helperVector.SetSize(x.Size()); + + y.UseDevice(true); + y = 0.0; +@@ -2997,7 +2995,7 @@ void BlockILU::Mult(const Vector &b, Vector &x) const + { + MFEM_ASSERT(height > 0, "BlockILU(0) preconditioner is not constructed"); + int nblockrows = Height()/block_size; +- y.SetSize(Height()); ++ auto y = Workspace::NewVector(Height()); + + DenseMatrix B; + Vector yi, yj, xi, xj; +@@ -3457,6 +3455,8 @@ void OrthoSolver::Mult(const Vector &b, Vector &x) const + MFEM_VERIFY(height == b.Size(), "incompatible input Vector size!"); + MFEM_VERIFY(height == x.Size(), "incompatible output Vector size!"); + ++ auto b_ortho = Workspace::NewVector(height); ++ + // Orthogonalize input + Orthogonalize(b, b_ortho); + +diff --git a/linalg/solvers.hpp b/linalg/solvers.hpp +index 5b222fc50..ff243746c 100644 +--- a/linalg/solvers.hpp ++++ b/linalg/solvers.hpp +@@ -368,7 +368,6 @@ private: + Vector dinv; + const double damping; + const Array *ess_tdof_list; // not 
owned; may be NULL +- mutable Vector residual; + /// Uses absolute values of the diagonal entries. + bool use_abs_diag = false; + +@@ -465,8 +464,6 @@ private: + const Vector &diag; + Array coeffs; + const Array& ess_tdof_list; +- mutable Vector residual; +- mutable Vector helperVector; + const Operator* oper; + }; + +@@ -1046,9 +1043,6 @@ private: + + Reordering reordering; + +- /// Temporary vector used in the Mult() function. +- mutable Vector y; +- + /// Permutation and inverse permutation vectors for the block reordering. + Array P, Pinv; + +@@ -1249,8 +1243,6 @@ public: + private: + Solver *solver = nullptr; + +- mutable Vector b_ortho; +- + void Orthogonalize(const Vector &v, Vector &v_ortho) const; + }; +