// transpose3d.cu
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "cuda.h"
#include "timer.h"
// Extents of the 3D volume. Elements are stored with x varying fastest,
// then y, then z (linear index = z*NX*NY + y*NX + x).
#define NX 512
#define NY 512
#define NZ 512
// Number of elements in one x-y plane. Parenthesised so the macro behaves as a
// single value in any expression (e.g. `idx / NXY`, `idx % NXY`), not only
// when it is the right-hand operand of a multiplication.
#define NXY (NX * NY)
// Edge length of the square thread block (and of the shared-memory tile).
#define K 32
/*
 * Fill the host volume so that every element holds its own linear index.
 *
 * data: array of NX*NY*NZ floats, addressed as z*NX*NY + y*NX + x.
 *
 * The index is computed as an int and converted to float on assignment.
 */
void initdata(float data[])
{
    for (int plane = 0; plane < NZ; ++plane)
    {
        for (int row = 0; row < NY; ++row)
        {
            // Linear index of the first element of this row.
            const int rowStart = plane * NXY + row * NX;
            float *rowPtr = data + rowStart;
            for (int col = 0; col < NX; ++col)
            {
                rowPtr[col] = rowStart + col;
            }
        }
    }
}
/*
 * Dump the whole volume to stdout.
 *
 * data: array of NX*NY*NZ floats, addressed as z*NX*NY + y*NX + x.
 *
 * Each row of NX values is printed on one line with two decimals; a line of
 * '=' characters follows every x-y plane.
 */
void printdata(float data[])
{
    // The three nested loops visit the elements in storage order, so a single
    // running pointer walks the array instead of recomputing the index.
    const float *cursor = data;
    for (int plane = 0; plane < NZ; ++plane)
    {
        for (int row = 0; row < NY; ++row)
        {
            for (int col = 0; col < NX; ++col)
            {
                printf("%.2f ", *cursor);
                ++cursor;
            }
            printf("\n");
        }
        printf("===========\n");
    }
}
/*
 * Host reference implementation: swap the y and z axes of the volume.
 *
 *     out[y][z][x] = in[z][y][x]
 *
 * in : NZ x NY x NX floats, x fastest (index = (z*NY + y)*NX + x).
 * out: NY x NZ x NX floats, x fastest (index = (y*NZ + z)*NX + x).
 *
 * The output plane stride is NX*NZ, not NX*NY: after the swap each output
 * plane holds NZ rows. The two are equal only when NY == NZ, so using NX*NY
 * would index out of bounds or overlap rows for non-cubic volumes.
 * in and out must not overlap.
 */
void transpose_CPU(float in[], float out[]) {
    for (int zi = 0; zi < NZ; zi++)
    {
        for (int yi = 0; yi < NY; yi++)
        {
            // Rows along x are contiguous in both layouts; only their
            // position inside the volume changes.
            const float *src = in + ((size_t)zi * NY + yi) * NX;
            float *dst = out + ((size_t)yi * NZ + zi) * NX;
            for (int xi = 0; xi < NX; xi++)
                dst[xi] = src[xi];
        }
    }
}
/*
 * Device kernel: swap the y and z axes of the volume.
 *
 *     out[y][z][x] = in[z][y][x]
 *
 * in : NZ x NY x NX floats, x fastest (index = (z*NY + y)*NX + x).
 * out: NY x NZ x NX floats, x fastest (index = (y*NZ + z)*NX + x).
 *
 * Launch layout: the x/y extent of the grid must cover NX x NY threads
 * (out-of-range threads exit early). The z extent may be any size >= 1:
 * each thread strides over z by the total number of z threads in the grid,
 * so all NZ planes are processed regardless of gridDim.z.
 * No shared memory is used. in and out must not overlap.
 */
__global__ void transpose(float in[], float out[])
{
    // x is the contiguous axis of both arrays, so it is taken from
    // threadIdx.x: consecutive threads then read and write consecutive floats.
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= NX || y >= NY)
        return;

    const unsigned int zStep = gridDim.z * blockDim.z;
    for (unsigned int z = threadIdx.z + blockIdx.z * blockDim.z; z < NZ; z += zStep)
    {
        // Output planes contain NZ rows, hence the NZ (not NY) row count in
        // the destination index.
        out[(y * NZ + z) * NX + x] = in[(z * NY + y) * NX + x];
    }
}
/*
 * Placeholder for a shared-memory tiled transpose.
 *
 * As written the kernel reads and writes no global memory: it computes the
 * tile origins and thread coordinates, declares a K x K shared tile and
 * executes one block-wide barrier. The load and store statements are
 * disabled because they refer to a symbol `N` that is not defined in this
 * file.
 *
 * in, out: unused until the disabled statements below are restored.
 */
__global__ void transpose_parallel_per_element_tiled(float in[], float out[])
{
    // Origin of this block's tile in the input ...
    int srcOriginI = blockIdx.x * K;
    int srcOriginJ = blockIdx.y * K;
    // ... and in the output, with the block coordinates exchanged.
    int dstOriginI = blockIdx.y * K;
    int dstOriginJ = blockIdx.x * K;

    // Thread coordinates inside the tile.
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    __shared__ float tile[K][K];

    // Disabled load (needs a row pitch `N`):
    //   tile[ty][tx] = in[(srcOriginI + tx) + (srcOriginJ + ty) * N];
    __syncthreads();
    // Disabled store (needs a row pitch `N`):
    //   out[(dstOriginI + tx) + (dstOriginJ + ty) * N] = tile[tx][ty];
}
/*
 * Select device 0 and print its memory clock, memory bus width and
 * theoretical peak memory bandwidth.
 *
 * cudaDeviceProp reports memoryClockRate in kHz and memoryBusWidth in bits.
 * Peak bandwidth is computed as
 *     2 * clock[Hz] * (busWidth[bits] / 8) bytes/s
 * where the factor 2 accounts for double-data-rate memory (two transfers per
 * clock); without it the printed peak is half the real figure.
 *
 * On a CUDA error a message is written to stderr and the function returns
 * without printing the properties.
 */
void testbandwidth()
{
    const int devicenum = 0;

    cudaError_t status = cudaSetDevice(devicenum);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", devicenum, cudaGetErrorString(status));
        return;
    }

    cudaDeviceProp deviceProp;
    status = cudaGetDeviceProperties(&deviceProp, devicenum);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", devicenum, cudaGetErrorString(status));
        return;
    }

    printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

    // kHz * bytes = 1e3 bytes/s, so dividing by 1e6 yields GB/s.
    const double peakGBs = 2.0 * deviceProp.memoryClockRate * (deviceProp.memoryBusWidth / 8.0) / 1.0e6;
    printf(" Theory peak : %.2f GB/s \n", peakGBs);
}
/* Abort with a readable message when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver: prints the device's memory figures, uploads an
 * NX x NY x NZ float volume, runs the transpose kernel 20 times printing the
 * elapsed time of each run in milliseconds, and copies the result back.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    testbandwidth();

    // size_t keeps the byte count safe if the extents are enlarged
    // (an int overflows above 2 GiB).
    const size_t numbytes = (size_t)NX * NY * NZ * sizeof(float);

    float *in = (float *) malloc(numbytes);
    float *out = (float *) malloc(numbytes);
    if (in == NULL || out == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", numbytes);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }
    initdata(in);

    float *d_in = NULL, *d_out = NULL; // on Device
    checkCuda(cudaMalloc((void **)&d_in, numbytes), "cudaMalloc(d_in)");
    checkCuda(cudaMalloc((void **)&d_out, numbytes), "cudaMalloc(d_out)");
    checkCuda(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice), "host-to-device copy");

    // K x K threads per block tile the x-y plane; one block layer per z-plane.
    dim3 threads(K, K);
    dim3 blocks((NX + threads.x - 1) / threads.x, (NY + threads.y - 1) / threads.y, NZ);

    // Create the timing events once and reuse them; creating a new pair on
    // every iteration without destroying it leaks event handles.
    cudaEvent_t start1, stop1;
    checkCuda(cudaEventCreate(&start1), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop1), "cudaEventCreate(stop)");

    for (int i = 0; i < 20; i++)
    {
        checkCuda(cudaEventRecord(start1, NULL), "cudaEventRecord(start)");
        transpose<<<blocks, threads>>>(d_in, d_out);
        // Launch-configuration errors are only reported through
        // cudaGetLastError(), never by the launch statement itself.
        checkCuda(cudaGetLastError(), "transpose kernel launch");
        checkCuda(cudaEventRecord(stop1, NULL), "cudaEventRecord(stop)");
        // Blocks until the kernel has finished; also surfaces execution errors.
        checkCuda(cudaEventSynchronize(stop1), "cudaEventSynchronize(stop)");

        float msecTotal1 = 0.0f;
        checkCuda(cudaEventElapsedTime(&msecTotal1, start1, stop1), "cudaEventElapsedTime");
        printf("Time : %f \n", msecTotal1);
    }

    checkCuda(cudaEventDestroy(start1), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop1), "cudaEventDestroy(stop)");

    checkCuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost), "device-to-host copy");
    //printdata(out);

    checkCuda(cudaFree(d_in), "cudaFree(d_in)");
    checkCuda(cudaFree(d_out), "cudaFree(d_out)");
    free(in);
    free(out);
    return 0;
}