#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_
#include "util/cudify/cudify_hardware_cpu.hpp"
#include "boost/bind.hpp"
#include <type_traits>
#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA
extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

// block/grid extents read by __syncthreads() and set by the launch macros below
extern dim3 blockDim;
extern dim3 gridDim;
static void __syncthreads()
{
    // save the emulated launch context around the block synchronization
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    // __alpa_base__ carries the Alpaka device, queue and current accelerator handle
    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restore the saved context
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
}
static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}
static void cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    memcpy(dev_mem,global_cuda_error_array,sz);
}
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
};
extern int vct_atomic_add;
extern int vct_atomic_rem;
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, int unused)
{
    memcpy(global_cuda_error_array+offset,mem,sz);
}
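// Usage sketch (hypothetical value): on this backend a "symbol" is just a host
// buffer, so copying to it is a plain memcpy:
//   int err = 1;
//   cudaMemcpyToSymbol(global_cuda_error_array, &err, sizeof(err), 0, 0);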
// CUB-style BlockScan shim: a block-wide exclusive prefix sum staged through a
// per-block std::array indexed by threadIdx.x.
template<typename T, unsigned int dim>
class BlockScan
{
public:
    typedef std::array<T,dim> TempStorage;

    BlockScan(TempStorage & tmp) : tmp(tmp) {}

    void ExclusiveSum(T & in, T & out)
    {
        tmp[threadIdx.x] = in;            // every thread stages its input
        __syncthreads();
        if (threadIdx.x == 0)             // thread 0 scans the staged array serially
        {
            T prec = tmp[0];
            tmp[0] = 0;
            for (int i = 1 ; i < dim ; i++)
            {
                auto next = tmp[i-1] + prec;
                prec = tmp[i];
                tmp[i] = next;
            }
        }
        __syncthreads();
        out = tmp[threadIdx.x];           // each thread reads back its exclusive prefix
    }

private:
    TempStorage & tmp;
};
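// Usage sketch (hypothetical kernel fragment, one value per thread in a
// 128-thread block):
//   BlockScan<int,128>::TempStorage temp;              // storage shared by the block
//   BlockScan<int,128>(temp).ExclusiveSum(my_value, my_prefix);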
// CUDA-style atomicAdd shim: plain (non-atomic) read-modify-write returning the previous value.
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}
// Comparator / reduction functors, used as `comp` and `op` by the merge and
// segreduce fallbacks further below.
template<typename type_t>
struct less_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a < b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a < b;
    }
};
template<typename type_t>
struct greater_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a > b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a > b;
    }
};
template<typename type_t>
struct plus_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return a + b;
    }
    type_t reduceInitValue() const {
        return 0;
    }
};
template<typename type_t>
struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::max(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::min();
    }
};
template<typename type_t>
struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::min(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::max();
    }
};
// Sequential fallback for segreduce: reduce each segment of `input` with `op`,
// writing one result per segment into `output`. `segments[i]` is the first
// index of segment i; the last segment runs to `count`.
template<typename input_it,
         typename segments_it, typename output_it,
         typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments,
               int num_segments, output_it output, op_t op, type_t init,
               context_t& gpuContext)
{
    int i = 0;
    for ( ; i < num_segments - 1; i++)
    {
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < segments[i+1] ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // last segment: reduce up to the end of the input
    int j = segments[i];
    output[i] = input[j];
    ++j;
    for ( ; j < count ; j++)
    {
        output[i] = op(output[i],input[j]);
    }
}
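// Worked example (hypothetical data): input = {1,2,3,4,5}, count = 5,
// segments = {0,2}, num_segments = 2, op = plus_t<int>()  ->  output = {3,12}.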
// Sequential fallback for merge: merge the two sorted key/value ranges (a, b)
// into c using the comparator `comp`.
template<typename a_keys_it, typename a_vals_it,
         typename b_keys_it, typename b_vals_it,
         typename c_keys_it, typename c_vals_it,
         typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
           b_keys_it b_keys, b_vals_it b_vals, int b_count,
           c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& gpuContext)
{
    int a_it = 0;
    int b_it = 0;
    int c_it = 0;

    while (a_it < a_count || b_it < b_count)
    {
        if (a_it < a_count && b_it < b_count)
        {
            if (comp(b_keys[b_it],a_keys[a_it]))
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                b_it++;
            }
            else
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                a_it++;
            }
        }
        else if (a_it < a_count)
        {
            c_keys[c_it] = a_keys[a_it];
            c_vals[c_it] = a_vals[a_it];
            a_it++;
        }
        else
        {
            c_keys[c_it] = b_keys[b_it];
            c_vals[c_it] = b_vals[b_it];
            b_it++;
        }
        c_it++;
    }
}
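// Worked example (hypothetical data): a_keys = {1,4}, b_keys = {2,3},
// comp = less_t<int>()  ->  c_keys = {1,2,3,4}, with values carried along key-wise.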
static void init_wrappers()
{
    if (__alpa_base__.initialized == true) {return;}

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}
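// Note: init_wrappers() must run before the first CUDA_LAUNCH*, since the launch
// macros below dereference __alpa_base__.queue.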
#ifdef PRINT_CUDA_LAUNCHES
// Launch `cuda_call` through alpaka::exec, mapping the ite.wthr/ite.thr extents
// onto an Alpaka work division and mirroring them into gridDim/blockDim.
#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }
// Same as CUDA_LAUNCH, but takes the grid/block extents as two dim3-convertible
// arguments instead of an iterator object.
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }
#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            \
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            \
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            \
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            \
            __alpa_base__.accKer = &acc;\
            \
            cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#endif // PRINT_CUDA_LAUNCHES
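// Usage sketch (hypothetical kernel and iterator names):
//   init_wrappers();
//   CUDA_LAUNCH(my_kernel, ite, data_ptr, n);                            // ite provides wthr/thr extents
//   CUDA_LAUNCH_DIM3(my_kernel, dim3(16,1,1), dim3(64,1,1), data_ptr, n);
//   cudaDeviceSynchronize();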
#endif /* CUDIFY_ALPAKA_HPP_ */