-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstreamline_kernel.cu
207 lines (164 loc) · 5.79 KB
/
streamline_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
////////////////////////////////////////////////////////////////////////////
//
// OSUFlowCuda kernel code
// Chun-Ming Chen
//
///////////////////////////////////////////////////////////////////////////
#include <cuda_runtime_api.h> // C-style function interface
#include "kernel_helper.h"
#include "streamline_kernel.h"
#if 1
#define REAL double
#define REAL4 double4
#define REAL3 double3
#else
#define REAL float
#define REAL4 float4
#define REAL3 float3
#endif
texture<VolumeType, 3, cudaReadModeElementType> tex; // 3D volume
__device__ AtPhysResult d_atPhys;
//////////////
// texture lookup to get vector data
inline __device__
REAL3 d_texLerp3(const REAL3 &pos)
{
REAL3 posf = floorf(pos);
REAL3 posc = posf + 1.;
REAL3 d,d1;
d = pos-posf;
d1 = posc-pos;
// If your indices are not normalized to [0,1], then you need to add 0.5f to each coordinate
posc=posc+.5f;
posf=posf+.5f;
REAL3 data[8];
assign( data[0], make_float3(tex3D(tex, posf.x, posf.y, posf.z)) );
assign( data[1], make_float3(tex3D(tex, posf.x, posf.y, posc.z)) );
assign( data[2], make_float3(tex3D(tex, posf.x, posc.y, posf.z)) );
assign( data[3], make_float3(tex3D(tex, posf.x, posc.y, posc.z)) );
assign( data[4], make_float3(tex3D(tex, posc.x, posf.y, posf.z)) );
assign( data[5], make_float3(tex3D(tex, posc.x, posf.y, posc.z)) );
assign( data[6], make_float3(tex3D(tex, posc.x, posc.y, posf.z)) );
assign( data[7], make_float3(tex3D(tex, posc.x, posc.y, posc.z)) );
return (
( (data[0]) * d1.z
+ (data[1]) * d.z) * d1.y
+( (data[2]) * d1.z
+ (data[3]) * d.z) *d.y
) * d1.x +
(
( (data[4]) * d1.z
+ (data[5]) * d.z) * d1.y
+( (data[6]) * d1.z
+ (data[7]) * d.z) * d.y
) * d.x;
}
inline __device__ bool getData(const REAL3 &pos, REAL3 &vec3, const KernelParam ¶m)
{
if (pos.x < param.minBound.x || pos.y < param.minBound.y || pos.z < param.minBound.z
|| pos.x >= param.maxBound.x || pos.y >= param.maxBound.y || pos.z >= param.maxBound.z )
return false;
#if 0 // device linear interpolation
// If your indices are not normalized to [0,1], then you need to add 0.5f to each coordinate
float4 vec4 = tex3D(tex, pos.x-param.minBound.x+.5f, pos.y-param.minBound.y+.5f, pos.z-param.minBound.z+.5f);
#else // manual interpolation
REAL3 minBound; assign(minBound, param.minBound);
vec3 = d_texLerp3(pos-minBound);
#endif
return true;
}
//////////////
// main function
__global__ void d_streamline(const KernelParam param)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= param.seeds) return;
int i=1;
REAL3 pos;
assign(pos, param.d_seedPos[idx]);
assign(param.d_trace[idx], pos);
bool result=true;
for (i=1; i<=param.maxSteps; i++)
{
bool r;
REAL3 vec3;
REAL stepSize = param.stepSize;
#if 1 // RK4
r = getData(pos, vec3, param);
result = result && r;
REAL3 k1 = vec3 * stepSize;
r = getData(pos + k1*(REAL)0.5, vec3, param);
result = result && r;
REAL3 k2 = vec3 * stepSize;
r = getData(pos + k2*(REAL)0.5, vec3, param);
result = result && r;
REAL3 k3 = vec3 * stepSize;
r = getData(pos + k3, vec3, param);
result = result && r;
REAL3 k4 = vec3 * stepSize;
if (!result)
break;
pos += (k1 + (k2 + k3)*(REAL)2 + k4) / (REAL)6;
#else
r = getData(pos, vec3);
if (!r)
break;
pos += vec3 * param.stepSize;
#endif
if (param.fullTraces )
assign(param.d_trace[idx + i*param.seeds_coalesced], pos);
}
if (!param.fullTraces)
assign(param.d_trace[idx], pos);
param.dn_trace[idx] = i; // return number of steps traced
}
//////////////
// (debug)
__global__ void d_getPhys(const float3 pos, const KernelParam param)
{
REAL3 vec3;
REAL3 rpos;
assign(rpos, pos);
bool r = getData(rpos, vec3, param);
assign(d_atPhys.vec, vec3);
d_atPhys.valid = r;
}
// Use gpgpu to do interpolation and return the value
extern "C"
AtPhysResult getPhys(const float3 pos, const KernelParam ¶m, const cudaChannelFormatDesc &channelDesc, cudaArray *d_volumeArray)
{
// set texture parameters
tex.normalized = false; // access without normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;
// bind array to 3D texture
CUDA_SAFE_CALL( cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
//printf("pos=%f %f %f, param.min=%f %f %f\n", pos.x, pos.y, pos.z, param.minBound.x, param.minBound.y, param.minBound.z);
cudaThreadSynchronize();
d_getPhys<<<1,1>>>(pos, param);
cudaDeviceSynchronize();
AtPhysResult atPhysResult;
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&atPhysResult, d_atPhys, sizeof(atPhysResult),0, cudaMemcpyDeviceToHost));
//printf("atPhysResult: vec=%f %f %f, valid=%d\n", atPhysResult.vec.x, atPhysResult.vec.y, atPhysResult.vec.z, atPhysResult.valid);
return atPhysResult;
}
////////////////////////////////////////////////////////////////////////////////////
extern "C"
void run_kernel(const KernelParam ¶m, const cudaChannelFormatDesc &channelDesc, cudaArray *d_volumeArray)
{
// set texture parameters
tex.normalized = false; // access without normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;
// bind array to 3D texture
CUDA_SAFE_CALL( cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
cudaThreadSynchronize();
dim3 blockSize( 256 );
dim3 gridSize( (int)ceil(param.seeds/(float)blockSize.x) );
d_streamline<<<gridSize, blockSize>>>(param);
cudaDeviceSynchronize();
}