#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_

#include "util/cudify/cudify_hardware_cpu.hpp"
#include "util/cuda_util.hpp"
#include "boost/bind.hpp"

#include <algorithm>
#include <array>
#include <cstring>
#include <functional>
#include <iostream>

#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA
#define GPU_HOST_DEVICE

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

// block/grid extents mirrored for the CUDA-style emulation below
extern dim3 blockDim;
extern dim3 gridDim;
static void __syncthreads()
{
    // save the per-thread CUDA-style state before yielding to the other block threads
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restore the state
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
}
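
/* Usage sketch (hypothetical, not part of this header): inside an emulated kernel the
 * CUDA-style indices behave as usual, and __syncthreads() orders a block-wide
 * write/read of a shared buffer. The buffer and kernel names are illustrative, and for
 * simplicity the sketch assumes a single block in flight; a real kernel would use
 * per-block shared storage.
 *
 *     static float block_buffer[256];                  // shared across the emulated block
 *
 *     void reverse_in_block(float * data)
 *     {
 *         block_buffer[threadIdx.x] = data[threadIdx.x];
 *         __syncthreads();                             // all writes visible before the reads below
 *         data[threadIdx.x] = block_buffer[blockDim.x - 1 - threadIdx.x];
 *     }
 */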
static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}
static void cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    memcpy(dev_mem, global_cuda_error_array, sz);
}
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost     = 0,
    cudaMemcpyHostToDevice   = 1,
    cudaMemcpyDeviceToHost   = 2,
    cudaMemcpyDeviceToDevice = 3
};
extern int vct_atomic_add;
extern int vct_atomic_rem;
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, int unused)
{
    memcpy(global_cuda_error_array + offset, mem, sz);
}
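
/* Sketch (hypothetical buffer name): on this backend both symbol copies are plain
 * memcpy wrappers, so a round trip through a byte buffer behaves as expected; the
 * final direction argument of cudaMemcpyToSymbol is ignored.
 *
 *     unsigned char error_buffer[16];
 *     int code = 42;
 *     cudaMemcpyToSymbol(error_buffer, &code, sizeof(code), 0, cudaMemcpyHostToDevice);
 *     int readback = 0;
 *     cudaMemcpyFromSymbol(&readback, error_buffer, sizeof(readback));   // readback == 42
 */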
    template<typename T, unsigned int dim>
    class BlockScan
    {
        std::array<T,dim> & tmp;   // block-shared scratch buffer
    public:
        typedef std::array<T,dim> TempStorage;
        BlockScan(TempStorage & tmp) : tmp(tmp) {}

        // block-wide exclusive prefix sum: thread 0 scans the shared buffer in place
        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;
            __syncthreads();
            if (threadIdx.x == 0)
            {
                T prec = tmp[0]; tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                { auto next = tmp[i-1] + prec; prec = tmp[i]; tmp[i] = next; }
            }
            __syncthreads();
            out = tmp[threadIdx.x];
        }
    };
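
    /* Usage sketch (hypothetical), mirroring the cub::BlockScan API this class stands
     * in for: one TempStorage instance must be shared by all threads of the block
     * (the static placement below is illustrative only).
     *
     *     typedef BlockScan<int, 128> Scan_t;
     *     static Scan_t::TempStorage temp;          // block-shared scratch
     *
     *     void kernel_body(int * counts, int * offsets)
     *     {
     *         int in  = counts[threadIdx.x];
     *         int out = 0;
     *         Scan_t(temp).ExclusiveSum(in, out);   // out = counts[0] + ... + counts[threadIdx.x - 1]
     *         offsets[threadIdx.x] = out;
     *     }
     */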
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}
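
/* Sketch: as in CUDA, atomicAdd returns the value stored at the address before the
 * addition; here it is a plain read-modify-write.
 *
 *     int counter = 10;
 *     int previous = atomicAdd(&counter, 5);   // previous == 10, counter == 15
 */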
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };
    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };
    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
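
    /* Sketch: these functors mirror the moderngpu-style operator types expected as the
     * `op`/`comp` arguments of the helpers below, and they also work with standard
     * algorithms.
     *
     *     int v[4] = {3, 1, 2, 4};
     *     std::sort(v, v + 4, greater_t<int>());   // v == {4, 3, 2, 1}
     *     int m = maximum_t<int>()(7, 5);          // m == 7
     */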
    template<typename input_it, typename segments_it, typename output_it,
             typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t & context)
    {
        int i = 0;
        for ( ; i < num_segments - 1 ; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i], input[j]);
            }
        }

        // last segment runs up to count
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i], input[j]);
        }
    }
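
    /* Worked sketch (hypothetical data): reduce every segment of `input` with the
     * given operator. `segments` holds the start offset of each segment; each segment
     * is seeded with its first element, so `init` is not used, and neither is
     * `context` on this host path (any object can be passed).
     *
     *     float in[7]  = {1, 2, 3, 4, 5, 6, 7};
     *     int   seg[3] = {0, 2, 5};                // segments [0,2), [2,5), [5,7)
     *     float out[3];
     *     int   dummy_context = 0;
     *     segreduce(in, 7, seg, 3, out, plus_t<float>(), 0.0f, dummy_context);
     *     // out == {3, 12, 13}
     */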
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t & context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count && b_it < b_count)
            {
                // both inputs still have elements: take the smaller key
                if (comp(b_keys[b_it], a_keys[a_it]))
                {
                    c_keys[c_it] = b_keys[b_it];
                    c_vals[c_it] = b_vals[b_it];
                    ++c_it; ++b_it;
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    ++c_it; ++a_it;
                }
            }
            else if (a_it < a_count)
            {
                // b exhausted: drain a
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                ++c_it; ++a_it;
            }
            else
            {
                // a exhausted: drain b
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                ++c_it; ++b_it;
            }
        }
    }
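
    /* Worked sketch (hypothetical data): merge two sorted key/value ranges into one.
     * `comp` decides the order; `context` is unused on this host path. For equal keys
     * the element from `a` is emitted first, since `b` is taken only when
     * comp(b_key, a_key) holds.
     *
     *     int   a_k[3] = {1, 4, 9};   float a_v[3] = {10, 40, 90};
     *     int   b_k[2] = {2, 4};      float b_v[2] = {20, 41};
     *     int   c_k[5];               float c_v[5];
     *     int   dummy_context = 0;
     *     merge(a_k, a_v, 3, b_k, b_v, 2, c_k, c_v, less_t<int>(), dummy_context);
     *     // c_k == {1, 2, 4, 4, 9},  c_v == {10, 20, 40, 41, 90}
     */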
static void init_wrappers()
{
    if (__alpa_base__.initialized == true) { return; }

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}
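
/* End-to-end sketch (hypothetical kernel and argument names): the wrappers must be
 * initialized once before any launch; CUDA_LAUNCH (defined below) then builds the
 * alpaka work division from `ite` and runs the kernel functor on the queue.
 *
 *     init_wrappers();                            // create the device accelerator and queue once
 *     CUDA_LAUNCH(my_kernel, ite, out_ptr, n);    // my_kernel/ite/out_ptr/n are placeholders
 *     cudaDeviceSynchronize();                    // wait before reading results on the host
 */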
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            __alpa_base__.accKer = &acc;\
            cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            __alpa_base__.accKer = &acc;\
            cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            __alpa_base__.accKer = &acc;\
            cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
            auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
            auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
            blockIdx.x = globalBlockIdx[0];\
            blockIdx.y = globalBlockIdx[1];\
            blockIdx.z = globalBlockIdx[2];\
            threadIdx.x = globalThreadIdx[0];\
            threadIdx.y = globalThreadIdx[1];\
            threadIdx.z = globalThreadIdx[2];\
            __alpa_base__.accKer = &acc;\
            cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#endif

#endif /* CUDIFY_ALPAKA_HPP_ */