-
Notifications
You must be signed in to change notification settings - Fork 0
/
CudaRand.cu
332 lines (285 loc) · 10.6 KB
/
CudaRand.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
//#include "log4cxx/logger.h"
#include "CudaRand.h"
#include "CudaRandKernels.h"
#include "ranluxCuda.constants"
CudaRand* CudaRand::getInstance(const int deviceBufferSizeByte,
const int seed)
{
static CudaRand instance(deviceBufferSizeByte, seed);
return &instance;
}
CudaRand::CudaRand(const int deviceBufferSizeByte_, const int seed) :
nNumbersInDeviceMemory(0),
nUsedNumbersInDeviceMemory(0)
{
//logger = log4cxx::Logger::getLogger("CudaRand");
// Every Thread is an independent PRNG and needs it's own randState.
// The randStates are organized in a pitched (1) 2-dimensional array:
// one row per block.
cudaMallocPitch((void**) &pCudaRandStates, &randPitch,
CRAND.blockDim.x * sizeof(sRandState),
CRAND.gridDim.x);
checkCudaError("cudaMallocPitch(&pCudaRandStates...");
// Create the same array on the host
pHostRandStates = (sRandState*) sMalloc(CRAND.gridDim.x * randPitch);
//LOG4CXX_INFO(logger, "Allocated host- and device-memory for randStates");
// Seed all those randStates and copy them to device memory.
seedRandStates(seed);
cudaMemcpy((void*) pCudaRandStates,
(void*) pHostRandStates,
CRAND.gridDim.x * randPitch,
cudaMemcpyHostToDevice);
checkCudaError("cudaMemcpy(pCudaRandStates, ...");
//LOG4CXX_INFO(logger, "Copied to device.");
// Get memory on device for results.
nNumbersInOneRun = CRAND.gridDim.x * CRAND.blockDim.x * 24;
int nBytesInOneRun = nNumbersInOneRun * sizeof(float);
int effectiveSizeByte = (deviceBufferSizeByte_ / nBytesInOneRun) *
nBytesInOneRun;
deviceBufferSizeByte = effectiveSizeByte;
//LOG4CXX_INFO(logger, "Allocating device memory for random numbers. "
//<< "Maximum buffer size was " << deviceBufferSizeByte_
//<< " bytes, i'm going to use "<< effectiveSizeByte << " bytes");
cudaMalloc((void**) &pCudaRandResult, deviceBufferSizeByte);
checkCudaError("cudaMalloc(&pCudaRandResult...");
}
void CudaRand::seedRandStates(const int seed)
{
for(int iBlock=0; iBlock<CRAND.gridDim.x; iBlock++)
{
sRandState* pRow =
(sRandState*) (((char*) pHostRandStates) + iBlock * randPitch);
for(int iThread=0; iThread<CRAND.blockDim.x; iThread++)
{
ranluxSeed(&pRow[iThread], seed + iBlock * CRAND.blockDim.x + iThread);
}
}
//LOG4CXX_INFO(logger, "Seeded randStates. Copying to device.");
}
void CudaRand::ranluxSeed(sRandState* s, const int seed)
{
//static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("ranlux.Seed"));
int jseed = seed;
int k;
//LOG4CXX_DEBUG(logger, "Seeding " << s << " with seed " << seed);
for(int i=0; i<24; i++)
{
k = jseed / 53668;
jseed = 40014 * (jseed - k*53668) - k*12211;
if(jseed < 0)
jseed += 2147483563;
s->seeds[i] = (jseed % 16777216) * CRAND.twom24;
}
s->i24 = 23;
s->j24 = 9;
s->carry = 0.f;
if(s->seeds[23] == 0.)
s->carry = CRAND.twom24;
}
// Generate new numbers. Those numbers will always start at the base-
// address of pCudaRandResult, probably overwriting a few unused numbers
// at the end.
void CudaRand::generateRandomNumbers(const int nIterations)
{
// Run Kernel
//LOG4CXX_INFO(logger, "Launching Kernel with " << CRAND.gridDim.x
// << " blocks and " << CRAND.blockDim.x << " threads per block. "
// << nIterations << " iterations yielding 24 numbers each.");
ranluxKernel<<< CRAND.gridDim, CRAND.blockDim >>>(pCudaRandResult, pCudaRandStates, randPitch, nIterations);
checkCudaError("Kernel launch ranluxKernel");
}
///I added - start
void CudaRand::test2()
{
// Check prerequisites
if(nNumbersInDeviceMemory == 0)
{
int nNumbers = deviceBufferSizeByte / sizeof(float);
float* pLocal = (float*) sMalloc(deviceBufferSizeByte);
getArray(pLocal, nNumbers);
int thread = 0;
int block = 0;
int nRuns = nNumbersInDeviceMemory / nNumbersInOneRun;
int offset = block * CRAND.blockDim.x * nRuns * 24 + thread;
int runOffset = 24 * CRAND.blockDim.x;
int numberOffset = CRAND.blockDim.x;
float test[nRuns];
for(int i=0; i<nRuns; i++){
test[i] = pLocal[offset + 0*runOffset + i*numberOffset];
printf("%f\n",test[i]);
}
}
}
///I added - end
// This test will fail (likely with a segfault) if deviceBufferSize
// is too small to generate all numbers needed at once.
void CudaRand::test()
{
// Reference numbers 0-4 and 100-104 published by F. James for seed
// 314159265 and p=223
const float r223[] = {0.53981817, 0.76155043, 0.06029940, 0.79600263,
0.30631220, 0.43156743, 0.03774416, 0.24897110,
0.00147784, 0.90274453};
// Reference numbers 0-4 and 100-104 published by F. James for seed 1
// and p=389
const float r389[] = {0.94589490, 0.47347850, 0.95152789, 0.42971975,
0.09127384, 0.02618265, 0.03775346, 0.97274780,
0.13302165, 0.43126065};
// You may change these numbers: 0 <= thread < blockDim.x
// 0 <= block < gridDim.x
// but you should really know what you're doing since you have to
// adjust the seed accordingly. You should also
int block = 0;
int thread = 0;
// Check prerequisites
if(nNumbersInDeviceMemory == 0)
{
// Ok, this instance is new.
// Get as many numbers as the device buffer is able to deliver.
int nNumbers = deviceBufferSizeByte / sizeof(float);
float* pLocal = (float*) sMalloc(deviceBufferSizeByte);
getArray(pLocal, nNumbers);
// Now that we have those numbers, we have to calculate at which position
// our references should be. This depends on the number of runs
// were requested from the kernel.
int nRuns = nNumbersInDeviceMemory / nNumbersInOneRun;
int offset = block * CRAND.blockDim.x * nRuns * 24 + thread;
int runOffset = 24 * CRAND.blockDim.x;
int numberOffset = CRAND.blockDim.x;
// Put the numbers into another array
float test[10];
for(int i=0; i<5; i++)
test[i] = pLocal[offset + 0*runOffset + i*numberOffset];
for(int i=0; i<5; i++)
test[i+5] = pLocal[offset + 4*runOffset + (i+4)*numberOffset];
const float* r;
if(CRAND.p == 223)
r = r223;
else if(CRAND.p == 389)
r = r389;
else
{
// LOG4CXX_ERROR(logger, "p should be 223 or 389, not " << CRAND.p
// << ". You will get weird results now, since i'm assuming 223!");
r = r223;
}
// Write differences to logger.
//LOG4CXX_ERROR(logger, "Following are the differences between the "
//<< "reference values and the calculated ones. If they differ only "
//<< "about 1e-8, everything's ok. If not, please check if you "
//<< "instantiated either with p=223 and seed=314159265 or with "
//<< "p=389 and seed=1");
for(int i=0; i<10; i++)
//LOG4CXX_ERROR(logger, test[i] - r[i]);
printf("test[%d]-r[%d] = %f - %f \n",i,i,test[i],r[i]);
//free(pLocal);
}
else
{
//LOG4CXX_ERROR(logger, "Can't perform test. This instance has been used "
//<< "before. Don't fetch any numbers before calling test()");
}
}
void CudaRand::getArray(float* a, const int n)
{
float* pDevice = getDeviceNumbers(n);
//LOG4CXX_INFO(logger, "Copying " << n << " numbers from device to array.");
cudaMemcpy((void*) a,
(void*) pDevice,
n * sizeof(float),
cudaMemcpyDeviceToHost);
checkCudaError("cudaMemcpy(device->a...");
}
// Reserves n numbers on the card for consumption on the card. Returns
// a pointer to the first usable number. If there are not enough numbers
// available, it generates those numbers. The returned pointer is always
// aligned at a multiple of 512bytes/128floats.
float* CudaRand::getDeviceNumbers(const int n)
{
// LOG4CXX_DEBUG(logger, "Trying to reserve " << n << " numbers on device.");
//printf("n=%d,deviceBufferSizeByte=%d\n",n,deviceBufferSizeByte);
int nextUsableIdx = ((nUsedNumbersInDeviceMemory + 127) / 128) * 128;
if(nNumbersInDeviceMemory - nextUsableIdx < n)
{
// We don't have enough numbers. Check if it's possible to create
// enough.
if(n > deviceBufferSizeByte / sizeof(float))
//TODO variablen mit raus.
throw RuntimeException("Device-buffer too small.");
// Yes. Fill up device buffer.
int nRuns;
if(nUsedNumbersInDeviceMemory == 0)
{
// Since there are not enough numbers available, but it was calculated
// that i may generate that many numbers but nUsedNumbers is zero, i
// conclude that this is the first run. I'm going to fill the whole
// buffer.
nRuns = (deviceBufferSizeByte / sizeof(float)) / nNumbersInOneRun;
}
else
{
// We already generated numbers some time ago. Therfore the numbers
// after nUsedNumbers are unused. We leave them (except a possible
// overlap), fill up from the beginning up to nUsedNumbers, and set
// back nUsedNumbers to zero.
nRuns = (nUsedNumbersInDeviceMemory + nNumbersInOneRun - 1) /
nNumbersInOneRun;
}
generateRandomNumbers(nRuns);
// This will happen only at the first run, when nNumbersInDevice is 0.
if(nRuns * nNumbersInOneRun > nNumbersInDeviceMemory)
nNumbersInDeviceMemory = nRuns * nNumbersInOneRun;
nUsedNumbersInDeviceMemory = 0;
}
else
{
// We do have enough numbers. Round up.
nUsedNumbersInDeviceMemory = nextUsableIdx;
}
// Return the pointer to the first usable number, and mark those numbers
// as used.
int startIdx = nUsedNumbersInDeviceMemory;
nUsedNumbersInDeviceMemory += n;
return &pCudaRandResult[startIdx];
}
CudaRand::~CudaRand()
{
// Release memory.
//LOG4CXX_INFO(logger, "Freeing memory on host and device");
cudaFree(pCudaRandStates);
cudaFree(pCudaRandResult);
free(pHostRandResult);
free(pHostRandStates);
}
void CudaRand::checkCudaError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
void* CudaRand::sMalloc(size_t size)
{
void* p = malloc(size);
if(!p)
printf("malloc returned Null-Pointer!\n");
return p;
}
extern "C"
{
CudaRand* CudaRand__setRandom(const int buff, const int seed){
CudaRand* pCR = CudaRand::getInstance(buff,seed);
return pCR;
}
float* CudaRand__getRandom(CudaRand* pCR, const int n){
float* x = pCR->getDeviceNumbers(n);
return x;
}
}
/* ==== REMARKS ====
(1) The pitching won't be used here: Since blockDim.x is 128 and the size
of an integer is 4 bytes, the rows would be aligned at 512-byte-offsets
anyway.
*/