diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..2e3b8bac188 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +16,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +26,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +48,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..4883d8495eb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..07b9f9acfc1 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.commit_group ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..af027c0b623 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..1c21efdd0a3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..ac33a05b69f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 08ebd3c28a7..06ff8e9014c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..b043eb9f456 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -64,7 +67,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +86,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +105,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +124,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +143,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +181,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +200,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +219,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +238,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +365,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +383,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - 
template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +401,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +419,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +437,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +455,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +473,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +491,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +509,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +527,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +545,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +563,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +581,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +599,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +617,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + 
template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +635,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..80e927d0375 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..0d658fd9256 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index c653b01cd60..d587d3f51a2 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. 
+ This file was automatically generated. Do not edit. + cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index 2fe14dcb3b2..ed21fa80b6e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..c7dd357632a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.mbarrier_init.release.cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ fence.mbarrier_init.release.cluster // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..fdd1f8d0b12 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.alias ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..8376e96ce6b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.async ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst index db582971c3d..78c3cd308a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.tensormap::generic.release.cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..374c182576f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..21436e2b3ca 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +15,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +90,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +105,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..47c56eca31a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -38,7 +41,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..ba909ae1f56 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.noComplete.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index b87d6f62a23..46adcd16be3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..2c3520a20f6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..d16b2ac07ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..ec464b3398b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.parity.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..3dfdba46861 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +16,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..4e7af4bace5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.parity.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +16,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..658fe0a8f44 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +22,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +36,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +50,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +64,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void 
red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +78,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +92,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +106,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +162,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +175,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index c519ea57f70..d00a152cf29 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 52fae102ad4..e42bae5a5a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 33e6f1d839a..a8c4a260782 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. 
Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..10d55714c5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +29,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +53,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +79,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +105,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +126,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..8ba40d45f64 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +87,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +114,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 73% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..7bb58675ddb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..a5534ef0b48 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +48,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..3cbd26fda04 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +47,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +127,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +249,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +294,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +336,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +419,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..915979d18f3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +101,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +198,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +242,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..2057323665a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +12,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +35,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..a35684c85e1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,8 @@ -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +157,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +206,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +255,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +304,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +353,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +402,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +451,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +500,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +549,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +598,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +673,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +718,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +763,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +781,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +823,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +865,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +907,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +949,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +991,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1033,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1075,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1117,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1159,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( 
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1201,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1243,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1285,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1327,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1364,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1401,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1436,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..1e13bb5f4f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,15 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +43,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -47,7 +51,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +60,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +85,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 @@ -89,7 +93,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +102,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +127,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..0c4678c95bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,15 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +38,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +55,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +75,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -78,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +92,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +112,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..9ec5b2443d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +42,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +123,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -212,7 +217,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +226,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +235,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +244,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +253,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +262,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +322,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -327,7 +332,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +342,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -347,7 +352,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +372,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +445,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +457,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +469,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -476,7 +481,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +493,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -500,7 +505,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +517,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +535,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..db00c4d4cba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +24,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +62,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); }), ( @@ -65,3 +70,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 80% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..e185913b3cd 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,13 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 74% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..40229b84a96 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. 
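For orientation, a minimal usage sketch of the fence wrappers touched in the hunks above (illustrative only, not part of this patch; the kernel name is hypothetical, and sm_90 compilation is assumed so every call shown is supported):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void fence_example() // hypothetical name, illustration only
   {
     // fence{.sem}.scope -> fence.acq_rel.gpu; (PTX ISA 60, SM_70)
     cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu);

     // Dedicated cluster-scope overload -> fence.sc.cluster; (PTX ISA 78, SM_90)
     cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cluster);

     // fence.mbarrier_init.release.cluster; (PTX ISA 80, SM_90)
     cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);
   }

The sem/scope tag arguments are resolved at compile time by the _CCCL_IF_CONSTEXPR / else _CCCL_IF_CONSTEXPR chains shown above, so each call lowers to exactly one PTX fence instruction.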
PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..f64b5faee5e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +43,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +53,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..1e6119ee032 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +24,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +64,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) @@ -80,3 +85,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..08128cc00a1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
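A similar sketch for the proxy fences above (illustrative only; the helper name is hypothetical, and the 128-byte size constant is an assumption based on the usual tensormap size, not something stated in this patch):

.. code:: cuda

   #include <cuda/ptx>

   // Hypothetical helper: fence a tensormap update in global memory and the async proxy.
   __device__ void proxy_fence_example(const void* tensormap_in_gmem)
   {
     // fence.proxy.tensormap::generic.release.gpu; // 7. (PTX ISA 83, SM_90)
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);

     // fence.proxy.tensormap::generic.acquire.gpu [addr], 128; // 8.
     cuda::ptx::fence_proxy_tensormap_generic(
       cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensormap_in_gmem, cuda::ptx::n32_t<128>{});

     // fence.proxy.async; // 5. (PTX ISA 80, SM_90)
     cuda::ptx::fence_proxy_async();
   }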
+ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +20,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +50,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +65,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +80,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +110,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +125,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +179,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +194,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +209,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +224,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); 
*/ #if __cccl_ptx_isa >= 200 @@ -234,7 +239,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +254,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +278,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +293,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +320,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +344,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +368,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +392,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +416,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +440,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +464,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +488,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +512,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +536,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 
-template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +560,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +584,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +608,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +632,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +656,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +680,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +704,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +728,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +752,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +776,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ #if __cccl_ptx_isa >= 100 @@ -786,7 +791,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +815,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +839,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +863,7 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +887,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +911,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +935,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +959,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +983,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1004,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..a769868f45c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
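The get_sreg_* wrappers above are plain special-register reads; a minimal sketch (hypothetical kernel name, illustration only):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __global__ void sreg_example() // hypothetical name, illustration only
   {
     std::uint32_t tid  = cuda::ptx::get_sreg_tid_x();           // %%tid.x   (PTX ISA 20)
     std::uint32_t lane = cuda::ptx::get_sreg_laneid();          // %%laneid  (PTX ISA 13)
     std::uint64_t clk  = cuda::ptx::get_sreg_clock64();         // %%clock64 (PTX ISA 20, SM_35)
     std::uint32_t rank = cuda::ptx::get_sreg_cluster_ctarank(); // %%cluster_ctarank (PTX ISA 78, SM_90)
     (void) tid; (void) lane; (void) clk; (void) rank;
   }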
+ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..e1afe25d8c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +30,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +84,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +130,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +180,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +208,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..79301a57851 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +37,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +82,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..cbfb275baa4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +29,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 78% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..d1e5c57c97e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +26,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..f3dbb6ed1c3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +63,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..b975434b2de 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +64,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..dd50a2c9f41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +34,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +94,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " "\n\t" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..d3deb3ca1d5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +35,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..d88392f3635 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +40,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +72,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +104,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +136,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +168,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +200,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +232,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +363,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +394,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +420,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va 
__cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..18fd2c03a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +27,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +66,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +111,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..b51b5185db0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +33,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +57,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..3889026750d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +572,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include -#include -#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include -#include -#include -#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. 
Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/test/internal_headers/CMakeLists.txt b/libcudacxx/test/internal_headers/CMakeLists.txt index 4c1031e5b4f..1f1e4947efb 100644 --- a/libcudacxx/test/internal_headers/CMakeLists.txt +++ b/libcudacxx/test/internal_headers/CMakeLists.txt @@ -26,6 +26,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND NOT "${CMAKE_CXX_STANDARD}" M list(FILTER internal_headers EXCLUDE REGEX "mdspan") endif() +# generated cuda::ptx headers are not standalone +list(FILTER internal_headers EXCLUDE REGEX "__ptx/instructions/generated") + function(libcudacxx_create_internal_header_test header_name, headertest_src, fallback) if(fallback) set(header_name "${header_name}_fallback")