-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.cu
123 lines (95 loc) · 4.21 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "kernels.cuh"
/*
 * cudaCheckErrors(msg)
 *
 * Reads the CUDA runtime's last recorded error with cudaGetLastError()
 * (which also resets it). If that error is not cudaSuccess, prints `msg`,
 * the CUDA error string and the file/line of the macro use to stderr, then
 * terminates the process with exit(1).
 *
 * Note: the macro inspects the last recorded error only; it does not look at
 * the return value of any particular API call, so place it right after the
 * call(s) it is meant to guard.
 *
 * The local variable deliberately avoids a leading double underscore, since
 * such identifiers are reserved for the implementation.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErr_ = cudaGetLastError(); \
        if (cudaCheckErr_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cudaCheckErr_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
/*
 * Dot-product benchmark driver.
 *
 * Fills two host vectors of vecLength floats with rand() values, copies them
 * to the device, launches gpuBigDot, copies the single float result back and
 * compares it against cpuBigDot (both presumably declared in kernels.cuh).
 * Prints both results, the relative error in percent, and a timing breakdown.
 *
 * Timing uses clock(), i.e. processor time consumed by this process. The GPU
 * phases are delimited as:
 *   startAll .. endCtoG       host/device allocation, host init, H2D copies
 *   startGPUKern .. endGPUKern kernel launch + completion (synchronized)
 *   endGPUKern .. endAll      D2H copy of the result
 *
 * Returns 0 on success, 1 if a host allocation fails. CUDA failures abort
 * through cudaCheckErrors.
 */
int main() {
    const size_t vecLength = 1024*1024;        /* number of elements in each vector */
    const size_t blockSize = 1024;             /* number of threads in each thread block */
    /* Integer ceil-division: enough blocks to cover vecLength elements. */
    const size_t gridSize = (vecLength + blockSize - 1) / blockSize;
    const size_t singleNumber = sizeof(float); /* size of one element / of the result */
    const size_t bytes = vecLength*singleNumber; /* size, in bytes, of each vector */

    size_t iVec;
    float *h_aVec, *h_bVec, *h_c;   /* host vectors and host result */
    float *d_aVec, *d_bVec, *d_c;   /* device vectors and device result */
    clock_t startAll, endAll, startCPU, endCPU;
    clock_t startGPUKern, endGPUKern, endCtoG;
    double diffsAll, diffsCPU, diffsKernel;
    double diffsCtoG, diffsGtoC;
    double cpuResult;               /* CPU reference value; original prints it with %lf */

    startAll = clock();

    /* Allocate memory for each vector on host */
    h_aVec = (float *) malloc(bytes);
    h_bVec = (float *) malloc(bytes);
    h_c = (float *) malloc(sizeof(float));
    if (h_aVec == NULL || h_bVec == NULL || h_c == NULL) {
        /* The fill loop below would dereference NULL; bail out cleanly. */
        fprintf(stderr, "Fatal error: host malloc failed\n");
        free(h_aVec);
        free(h_bVec);
        free(h_c);
        return 1;
    }

    /* Allocate memory for each vector on device */
    cudaMalloc((void**)&d_aVec, bytes);
    cudaMalloc((void**)&d_bVec, bytes);
    cudaMalloc((void**)&d_c, singleNumber);
    cudaCheckErrors("cudaMalloc fail");

    /* Zero the device result before the launch (all-zero bytes == 0.0f);
       presumably the kernel accumulates into it - confirm in kernels.cuh.
       cudaMemset takes an int byte value, so pass 0 rather than 0.0. */
    cudaMemset(d_c, 0, singleNumber);
    cudaCheckErrors("cudaMemset fail");

    srand((unsigned) time(NULL)); /* Initializes random number generator */

    /* Initializes host vectors with random values */
    for (iVec = 0; iVec < vecLength; ++iVec) {
        h_aVec[iVec] = rand();
        h_bVec[iVec] = rand();
    }

    /* Copy arrays from host to device */
    cudaMemcpy(d_aVec, h_aVec, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_bVec, h_bVec, bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    /* Sampled AFTER the H2D copies so the reported "allocation and data
       transfer" phase really contains the transfer. */
    endCtoG = clock();

    startGPUKern = clock();
    /* launch configuration <<<numberOfThreadBlocks, numberOfThreadsWithinEachBlock>>> */
    gpuBigDot<<<(unsigned int)gridSize, (unsigned int)blockSize>>>(d_aVec, d_bVec, d_c, vecLength);
    cudaCheckErrors("kernel fail");
    /* Kernel launches return immediately; wait for completion so the kernel
       time is attributed to the kernel phase and not to the D2H copy, and so
       execution errors surface here. */
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel execution fail");
    endGPUKern = clock();

    /* Copy value from device back to host */
    cudaMemcpy(h_c, d_c, singleNumber, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");
    endAll = clock();

    /* Uncomment to check the first 10 values */
    /****************************************************
    size_t numbOfTestVals = 10;
    size_t i;
    printf("\nindex\t vector A\t vector B\t\ta.b\n");
    for(i = 0; i<numbOfTestVals; ++i)
        printf("[%zu]\t%15.2lf\t%15.2lf\t\t%15.2lf\n",i,h_aVec[i],h_bVec[i],h_aVec[i]*h_bVec[i]);
    ****************************************************/

    printf("a.b GPU value = %lf\n", *h_c);

    /* Compute the CPU reference once, timing only the computation itself. */
    startCPU = clock();
    cpuResult = cpuBigDot(h_aVec, h_bVec, vecLength);
    endCPU = clock();
    printf("a.b CPU value = %lf\n", cpuResult);

    diffsCPU    = (endCPU - startCPU)/(double)CLOCKS_PER_SEC;
    diffsKernel = (endGPUKern - startGPUKern)/(double)CLOCKS_PER_SEC;
    diffsAll    = (endAll - startAll)/(double)CLOCKS_PER_SEC;
    diffsCtoG   = (endCtoG - startAll)/(double)CLOCKS_PER_SEC;
    diffsGtoC   = (endAll - endGPUKern)/(double)CLOCKS_PER_SEC;

    /* Relative difference of the GPU result w.r.t. the CPU reference, in percent. */
    printf("error value = %lf%%\n", (*h_c - cpuResult)/cpuResult*100);
    printf("\nCPU: Tcpu = %lf seconds\n", diffsCPU);
    printf("GPU: Tgpu = %lf seconds\n", diffsAll);
    printf("GPU: Memory allocation and data transfer from CPU to GPU = %lf seconds\n", diffsCtoG);
    printf("GPU: Kernel execution = %lf seconds\n", diffsKernel);
    printf("GPU: Data transfer from GPU to CPU time = %lf seconds\n", diffsGtoC);
    printf("Speedup = GPU/CPU = %lf\n", diffsAll/diffsCPU);

    /* Frees the device memory allocated to variables */
    cudaFree(d_aVec);
    cudaFree(d_bVec);
    cudaFree(d_c);

    /* Frees the host memory allocated to variables */
    free(h_aVec);
    free(h_bVec);
    free(h_c);
    return 0;
}