OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
cudify_alpaka.hpp
#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_
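
// This header provides a CUDA-like compatibility layer on top of the alpaka
// backend: emulated threadIdx/blockIdx/blockDim/gridDim globals, __syncthreads()
// and cudaDeviceSynchronize() shims, a minimal subset of the cub/gpu primitives
// used by OpenFPM (BlockScan, segreduce, merge), and the CUDA_LAUNCH macros
// that dispatch kernels through alpaka::exec.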

#include "util/cudify/cudify_hardware_cpu.hpp"
#include "util/cuda_util.hpp"
#include "boost/bind.hpp"
#include <type_traits>

#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA
#define GPU_HOST_DEVICE

extern alpa_base_structs __alpa_base__;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

static void __syncthreads()
{
    // save the emulated thread/block coordinate state
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restore the emulated thread/block coordinate state
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
}

// wait until all work submitted to the alpaka queue has completed
static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}

// host-side shim: the "symbol" is plain host memory, so a memcpy is sufficient
static void cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    memcpy(dev_mem, global_cuda_error_array, sz);
}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

extern int vct_atomic_add;
extern int vct_atomic_rem;

static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, int unused)
{
    memcpy(global_cuda_error_array + offset, mem, sz);
}

namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        // Exclusive prefix sum across the block: every thread deposits its
        // input, thread 0 performs the scan sequentially, and every thread
        // then reads back its own entry.
        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
}
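
// Usage sketch (illustrative, not part of the original header): inside a
// kernel run through CUDA_LAUNCH with blocks of 128 threads, the emulated
// cub::BlockScan is used much like the real one:
//
//   typedef cub::BlockScan<unsigned int, 128> BlockScanT;
//   BlockScanT::TempStorage temp;                   // storage shared by the block
//   unsigned int offset;
//   BlockScanT(temp).ExclusiveSum(counter, offset);
//   // offset now holds the sum of `counter` over threads with a lower threadIdx.x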

// emulation of atomicAdd: read the old value, add val, and return the old
// value (a plain read-modify-write in this backend)
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
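
// Example (illustrative): with int c = 0, atomicAdd(&c, 5) returns 0 and
// leaves c == 5.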

namespace gpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*  template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*  template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

/*  template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}


namespace gpu
{
    // Sequential segmented reduction: for each segment i, output[i] is the
    // reduction (with op) of input[segments[i] .. segments[i+1]-1]; the last
    // segment runs to input[count-1]. init is unused in this emulation.
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
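
    // Worked example (illustrative): input = {1,2,3,4,5}, count = 5,
    // segments = {0,2}, num_segments = 2, op = plus_t<int>()
    //   -> output = {1+2, 3+4+5} = {3, 12}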

    // Key-value merge: sequentially merge the two sorted key ranges a and b
    // (with their associated values) into c, using comp to order the keys.
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        // the next key of b comes first
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    // b is exhausted, copy the rest of a
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                // a is exhausted, copy the rest of b
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
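
    // Worked example (illustrative): a_keys = {1,3}, b_keys = {2,4} with
    // comp = less_t<int>() produce c_keys = {1,2,3,4}, the associated values
    // being carried along in the same order.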
}


static void init_wrappers()
{
    if (__alpa_base__.initialized == true)  {return;}

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    // Create a queue on the device
    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}
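
// init_wrappers() is expected to run before any CUDA_LAUNCH below, since the
// launch macros dereference the queue it allocates.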

// CUDA_LAUNCH / CUDA_LAUNCH_DIM3 map a CUDA-style kernel launch onto
// alpaka::exec: they build an alpaka work division from the requested
// grid/block sizes, mirror those sizes into the emulated gridDim/blockDim
// globals, and run cuda_call inside an accelerator lambda that keeps
// threadIdx/blockIdx up to date. The PRINT_CUDA_LAUNCHES variants also
// print the name of the launched kernel.

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#endif

#endif
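
// Usage sketch (illustrative, not part of the original header): a typical
// launch through this backend, assuming a kernel functor `my_kernel` and an
// iterator object `ite` exposing the wthr/thr members used by CUDA_LAUNCH:
//
//   init_wrappers();
//   CUDA_LAUNCH(my_kernel, ite, arg1, arg2);
//   cudaDeviceSynchronize();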