diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..2e3b8bac188 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +16,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +26,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +48,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..4883d8495eb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..07b9f9acfc1 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.commit_group ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..af027c0b623 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..1c21efdd0a3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..ac33a05b69f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 08ebd3c28a7..06ff8e9014c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..b043eb9f456 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -64,7 +67,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +86,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +105,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +124,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +143,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +181,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +200,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +219,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +238,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +365,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +383,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - 
template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +401,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +419,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +437,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +455,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +473,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +491,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +509,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +527,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +545,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +563,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +581,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +599,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +617,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + 
template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +635,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..80e927d0375 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..0d658fd9256 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index c653b01cd60..d587d3f51a2 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. 
+ This file was automatically generated. Do not edit. + cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index 2fe14dcb3b2..ed21fa80b6e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..c7dd357632a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.mbarrier_init.release.cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ fence.mbarrier_init.release.cluster // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..fdd1f8d0b12 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.alias ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..8376e96ce6b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.async ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst index db582971c3d..78c3cd308a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.tensormap::generic.release.cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..374c182576f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..21436e2b3ca 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +15,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +90,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +105,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..47c56eca31a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -38,7 +41,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..ba909ae1f56 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.noComplete.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index b87d6f62a23..46adcd16be3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..2c3520a20f6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..d16b2ac07ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..ec464b3398b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.parity.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..3dfdba46861 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +16,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..4e7af4bace5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.parity.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +16,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..658fe0a8f44 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +22,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +36,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +50,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +64,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void 
red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +78,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +92,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +106,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +162,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +175,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index c519ea57f70..d00a152cf29 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 52fae102ad4..e42bae5a5a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 33e6f1d839a..a8c4a260782 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. 
Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..10d55714c5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +29,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +53,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +79,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +105,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +126,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..8ba40d45f64 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +87,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +114,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 73% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..7bb58675ddb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..a5534ef0b48 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +48,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..3cbd26fda04 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +47,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +127,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +249,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +294,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +336,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +419,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..915979d18f3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +101,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +198,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +242,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..2057323665a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +12,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +35,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..a35684c85e1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,8 @@ -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +157,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +206,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +255,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +304,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +353,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +402,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +451,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +500,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +549,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +598,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +673,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +718,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +763,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +781,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +823,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +865,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +907,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +949,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +991,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1033,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1075,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1117,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1159,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( 
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1201,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1243,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1285,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1327,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1364,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1401,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1436,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..1e13bb5f4f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,15 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +43,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -47,7 +51,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +60,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +85,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 @@ -89,7 +93,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +102,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +127,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..0c4678c95bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,15 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +38,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +55,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +75,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -78,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +92,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +112,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..9ec5b2443d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +42,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +123,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -212,7 +217,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +226,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +235,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +244,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +253,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +262,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +322,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -327,7 +332,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +342,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -347,7 +352,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +372,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +445,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +457,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +469,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -476,7 +481,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +493,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -500,7 +505,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +517,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +535,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..db00c4d4cba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +24,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +62,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); }), ( @@ -65,3 +70,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 80% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..e185913b3cd 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,13 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 74% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..40229b84a96 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. 
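For orientation, a minimal usage sketch of the fence wrappers touched in the hunks above (illustrative only, not part of this patch; the kernel name is hypothetical, and sm_90 compilation is assumed so every call shown is supported):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void fence_example() // hypothetical name, illustration only
   {
     // fence{.sem}.scope -> fence.acq_rel.gpu; (PTX ISA 60, SM_70)
     cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu);

     // Dedicated cluster-scope overload -> fence.sc.cluster; (PTX ISA 78, SM_90)
     cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cluster);

     // fence.mbarrier_init.release.cluster; (PTX ISA 80, SM_90)
     cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);
   }

The sem/scope tag arguments are resolved at compile time by the _CCCL_IF_CONSTEXPR / else _CCCL_IF_CONSTEXPR chains shown above, so each call lowers to exactly one PTX fence instruction.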
PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..f64b5faee5e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +43,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +53,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..1e6119ee032 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +24,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +64,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) @@ -80,3 +85,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..08128cc00a1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
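A similar sketch for the proxy fences above (illustrative only; the helper name is hypothetical, and the 128-byte size constant is an assumption based on the usual tensormap size, not something stated in this patch):

.. code:: cuda

   #include <cuda/ptx>

   // Hypothetical helper: fence a tensormap update in global memory and the async proxy.
   __device__ void proxy_fence_example(const void* tensormap_in_gmem)
   {
     // fence.proxy.tensormap::generic.release.gpu; // 7. (PTX ISA 83, SM_90)
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);

     // fence.proxy.tensormap::generic.acquire.gpu [addr], 128; // 8.
     cuda::ptx::fence_proxy_tensormap_generic(
       cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensormap_in_gmem, cuda::ptx::n32_t<128>{});

     // fence.proxy.async; // 5. (PTX ISA 80, SM_90)
     cuda::ptx::fence_proxy_async();
   }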
+ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +20,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +50,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +65,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +80,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +110,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +125,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +179,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +194,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +209,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +224,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); 
*/ #if __cccl_ptx_isa >= 200 @@ -234,7 +239,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +254,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +278,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +293,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +320,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +344,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +368,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +392,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +416,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +440,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +464,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +488,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +512,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +536,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 
-template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +560,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +584,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +608,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +632,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +656,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +680,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +704,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +728,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +752,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +776,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ #if __cccl_ptx_isa >= 100 @@ -786,7 +791,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +815,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +839,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +863,7 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +887,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +911,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +935,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +959,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +983,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1004,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..a769868f45c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
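The get_sreg_* wrappers above are plain special-register reads; a minimal sketch (hypothetical kernel name, illustration only):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __global__ void sreg_example() // hypothetical name, illustration only
   {
     std::uint32_t tid  = cuda::ptx::get_sreg_tid_x();           // %%tid.x   (PTX ISA 20)
     std::uint32_t lane = cuda::ptx::get_sreg_laneid();          // %%laneid  (PTX ISA 13)
     std::uint64_t clk  = cuda::ptx::get_sreg_clock64();         // %%clock64 (PTX ISA 20, SM_35)
     std::uint32_t rank = cuda::ptx::get_sreg_cluster_ctarank(); // %%cluster_ctarank (PTX ISA 78, SM_90)
     (void) tid; (void) lane; (void) clk; (void) rank;
   }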
+ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..e1afe25d8c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +30,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +84,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +130,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +180,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +208,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..79301a57851 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +37,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +82,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..cbfb275baa4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +29,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 78% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..d1e5c57c97e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +26,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..f3dbb6ed1c3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +63,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..b975434b2de 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +64,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..dd50a2c9f41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +34,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +94,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " "\n\t" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..d3deb3ca1d5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +35,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..d88392f3635 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +40,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +72,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +104,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +136,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +168,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +200,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +232,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +363,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +394,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +420,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va 
__cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..18fd2c03a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +27,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +66,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +111,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..b51b5185db0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +33,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +57,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..3889026750d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +572,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include -#include -#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include -#include -#include -#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. 
Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/test/internal_headers/CMakeLists.txt b/libcudacxx/test/internal_headers/CMakeLists.txt index 4c1031e5b4f..1f1e4947efb 100644 --- a/libcudacxx/test/internal_headers/CMakeLists.txt +++ b/libcudacxx/test/internal_headers/CMakeLists.txt @@ -26,6 +26,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND NOT "${CMAKE_CXX_STANDARD}" M list(FILTER internal_headers EXCLUDE REGEX "mdspan") endif() +# generated cuda::ptx headers are not standalone +list(FILTER internal_headers EXCLUDE REGEX "__ptx/instructions/generated") + function(libcudacxx_create_internal_header_test header_name, headertest_src, fallback) if(fallback) set(header_name "${header_name}_fallback")