Skip to content

Commit

Permalink
Update matrix_transpose.cl
Browse files Browse the repository at this point in the history
  • Loading branch information
Smart781 authored Oct 6, 2024
1 parent ab80a17 commit 85bbc9e
Showing 1 changed file with 36 additions and 25 deletions.
61 changes: 36 additions & 25 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -1,41 +1,52 @@
/*
 * Naive out-of-place transpose: one work-item copies one element.
 *
 * A   - input matrix, M rows x K columns, row-major (read as A[row * K + col]).
 * A_T - output matrix, K rows x M columns, row-major
 *       (written as A_T[col * M + row]).
 * M   - number of rows of A.
 * K   - number of columns of A.
 *
 * Global dimension 0 indexes rows of A and global dimension 1 indexes
 * columns of A. Work-items that fall outside the M x K range do nothing,
 * so the global size may be rounded up past the matrix dimensions.
 */
__kernel void matrix_transpose_naive(
    __global const float *A,
    __global float *A_T,
    const unsigned int M,
    const unsigned int K)
{
    /* size_t indices: no signed/unsigned comparison against M and K, and the
     * flattened offsets below are computed at size_t width. */
    const size_t row = get_global_id(0);
    const size_t col = get_global_id(1);

    if (row >= M || col >= K) {
        return;
    }

    A_T[col * M + row] = A[row * K + col];
}

#define SIZE 16

/*
 * Tiled out-of-place transpose through local memory, using an unpadded tile.
 *
 * A   - input matrix, M rows x K columns, row-major.
 * A_T - output matrix, K rows x M columns, row-major.
 * M   - number of rows of A.
 * K   - number of columns of A.
 *
 * Global dimension 0 indexes rows of A, global dimension 1 indexes columns.
 *
 * Requirements on the launch configuration:
 *   - the work-group must be square (local size 0 == local size 1), because
 *     every work-item reads the tile element stored by the work-item with
 *     swapped local ids;
 *   - the work-group side must not exceed 32, the side of the tile.
 *
 * Each work-item loads the element at the *swapped* position inside the
 * group's tile, so consecutive work-items along dimension 0 read consecutive
 * addresses of A; after the barrier they also write consecutive addresses of
 * A_T. The read-back from the tile walks it with a stride of 32 floats
 * between neighbouring work-items -- presumably the local-memory bank
 * conflict pattern the kernel name refers to (hardware dependent).
 */
__kernel void matrix_transpose_local_bad_banks(
    __global const float *A,
    __global float *A_T,
    const unsigned int M,
    const unsigned int K)
{
    __local float tile[32][32];

    const size_t i = get_global_id(0);
    const size_t j = get_global_id(1);
    const size_t local_i = get_local_id(0);
    const size_t local_j = get_local_id(1);

    /* Top-left corner of this work-group's tile in A. */
    const size_t base_i = i - local_i;
    const size_t base_j = j - local_j;

    /* Element of A loaded by this work-item: local offsets are swapped so
     * that tile[local_j][local_i] ends up holding A[base_i + local_j][base_j + local_i]. */
    const size_t src_row = base_i + local_j;
    const size_t src_col = base_j + local_i;

    if (src_row < M && src_col < K) {
        tile[local_j][local_i] = A[src_row * K + src_col];
    }

    /* Every work-item of the group must reach the barrier, hence the guards
     * wrap only the memory accesses instead of returning early. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* tile[local_i][local_j] == A[i][j]; it was stored by the work-item with
     * local ids (local_j, local_i), whose load guard is exactly i < M && j < K. */
    if (i < M && j < K) {
        A_T[j * M + i] = tile[local_i][local_j];
    }
}

/*
 * Tiled out-of-place transpose through local memory, using a tile whose rows
 * are padded by one element.
 *
 * A   - input matrix, M rows x K columns, row-major.
 * A_T - output matrix, K rows x M columns, row-major.
 * M   - number of rows of A.
 * K   - number of columns of A.
 *
 * Global dimension 0 indexes rows of A, global dimension 1 indexes columns.
 *
 * Requirements on the launch configuration:
 *   - the work-group must be square (local size 0 == local size 1), because
 *     every work-item reads the tile element stored by the work-item with
 *     swapped local ids;
 *   - the work-group side must not exceed 32, the usable side of the tile.
 *
 * Each work-item loads the element at the *swapped* position inside the
 * group's tile, so consecutive work-items along dimension 0 read consecutive
 * addresses of A; after the barrier they also write consecutive addresses of
 * A_T. The extra column makes the row stride 33 floats, so the column-wise
 * read-back from the tile no longer steps by a multiple of 32 -- presumably
 * this is what avoids local-memory bank conflicts on the target hardware.
 */
__kernel void matrix_transpose_local_good_banks(
    __global const float *A,
    __global float *A_T,
    const unsigned int M,
    const unsigned int K)
{
    /* The "+ 1" column is padding only; it is never read or written. */
    __local float tile[32][32 + 1];

    const size_t i = get_global_id(0);
    const size_t j = get_global_id(1);
    const size_t local_i = get_local_id(0);
    const size_t local_j = get_local_id(1);

    /* Top-left corner of this work-group's tile in A. */
    const size_t base_i = i - local_i;
    const size_t base_j = j - local_j;

    /* Element of A loaded by this work-item: local offsets are swapped so
     * that tile[local_j][local_i] ends up holding A[base_i + local_j][base_j + local_i]. */
    const size_t src_row = base_i + local_j;
    const size_t src_col = base_j + local_i;

    if (src_row < M && src_col < K) {
        tile[local_j][local_i] = A[src_row * K + src_col];
    }

    /* Every work-item of the group must reach the barrier, hence the guards
     * wrap only the memory accesses instead of returning early. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* tile[local_i][local_j] == A[i][j]; it was stored by the work-item with
     * local ids (local_j, local_i), whose load guard is exactly i < M && j < K. */
    if (i < M && j < K) {
        A_T[j * M + i] = tile[local_i][local_j];
    }
}

0 comments on commit 85bbc9e

Please sign in to comment.