Skip to content

Commit

Permalink
Update matrix_transpose.cl
Browse files Browse the repository at this point in the history
  • Loading branch information
Smart781 authored Oct 6, 2024
1 parent 37794e6 commit ab80a17
Showing 1 changed file with 25 additions and 53 deletions.
78 changes: 25 additions & 53 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -1,69 +1,41 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>
#endif


#line 6

/*
 * Naive transpose: each work-item copies exactly one element of `a` to the
 * mirrored position in `a_t`, going straight from global memory to global
 * memory.
 *
 * a    - input buffer, indexed as a[row * k + col] (rows of length k)
 * a_t  - output buffer, indexed as a_t[row * m + col] (rows of length m)
 * m    - row length of a_t; upper bound for get_global_id(1)
 * k    - row length of a;   upper bound for get_global_id(0)
 *
 * Work-items whose ids fall outside [0, k) x [0, m) do nothing, so the
 * global work size may be rounded up past the matrix extents without
 * touching memory outside either buffer.
 */
__kernel void matrix_transpose_naive(__global const float *a,
                                     __global float *a_t,
                                     const int m,
                                     const int k)
{
    const int i = get_global_id(0); /* column of a, row of a_t */
    const int j = get_global_id(1); /* row of a, column of a_t */

    /* Padding work-items must not read or write out of bounds. */
    if (i >= k || j >= m) {
        return;
    }

    a_t[i * m + j] = a[j * k + i];
}

#define SIZE 16

/*
 * Transpose that stages each element in a work-group local buffer before
 * writing it to `a_t`. The local buffer uses an unpadded row stride of SIZE.
 *
 * a    - input buffer, indexed as a[row * k + col] (rows of length k)
 * a_t  - output buffer, indexed as a_t[row * m + col] (rows of length m)
 * m    - row length of a_t; upper bound for get_global_id(0)
 * k    - row length of a;   upper bound for get_global_id(1)
 *
 * Both local ids must stay below SIZE, otherwise `slot` is no longer unique
 * per work-item or runs past the end of `buffer`.
 *
 * NOTE(review): every work-item reads back the very slot it wrote, so the
 * staging step does not change which global addresses a work-item touches.
 * A tile-level transpose would need the load and the store to use swapped
 * local ids; confirm the intended launch geometry before changing that.
 */
__kernel void matrix_transpose_local_bad_banks(__global const float *a,
                                               __global float *a_t,
                                               const int m,
                                               const int k)
{
    __local float buffer[SIZE * SIZE];

    const int i = get_global_id(0); /* row of a, column of a_t */
    const int j = get_global_id(1); /* column of a, row of a_t */
    const int l_i = get_local_id(0);
    const int l_j = get_local_id(1);

    const int slot = l_i * SIZE + l_j;
    const bool in_range = (i < m) && (j < k);

    /* Guard only the memory accesses: the barrier below has to be reached
     * by every work-item of the group, including out-of-range ones. */
    if (in_range) {
        buffer[slot] = a[i * k + j];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range) {
        a_t[m * j + i] = buffer[slot];
    }
}


/*
 * Transpose that stages each element in a work-group local buffer before
 * writing it to `a_t`. The local buffer rows are padded by one element
 * (stride SIZE + 1 instead of SIZE) -- presumably to spread accesses over
 * different local-memory banks, as the kernel name suggests.
 *
 * a    - input buffer, indexed as a[row * k + col] (rows of length k)
 * a_t  - output buffer, indexed as a_t[row * m + col] (rows of length m)
 * m    - row length of a_t; upper bound for get_global_id(0)
 * k    - row length of a;   upper bound for get_global_id(1)
 *
 * Both local ids must stay below SIZE so that `slot` is unique per
 * work-item and stays inside `buffer`.
 *
 * NOTE(review): every work-item reads back the very slot it wrote, so the
 * staging step does not change which global addresses a work-item touches.
 * A tile-level transpose would need the load and the store to use swapped
 * local ids; confirm the intended launch geometry before changing that.
 */
__kernel void matrix_transpose_local_good_banks(__global const float *a,
                                                __global float *a_t,
                                                const int m,
                                                const int k)
{
    __local float buffer[SIZE * (SIZE + 1)];

    const int i = get_global_id(0); /* row of a, column of a_t */
    const int j = get_global_id(1); /* column of a, row of a_t */
    const int l_i = get_local_id(0);
    const int l_j = get_local_id(1);

    const int slot = l_j * (SIZE + 1) + l_i;
    const bool in_range = (i < m) && (j < k);

    /* Guard only the memory accesses: the barrier below has to be reached
     * by every work-item of the group, including out-of-range ones. */
    if (in_range) {
        buffer[slot] = a[i * k + j];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range) {
        a_t[m * j + i] = buffer[slot];
    }
}

0 comments on commit ab80a17

Please sign in to comment.