#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_

#include "cudify_hardware.hpp"
#include "cuda_util.hpp"
#include "boost/bind.hpp"
#include <type_traits>
#include <array>
#include <algorithm>
#include <cstring>

#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA

// CUDA-style thread/block context, emulated on top of Alpaka. The backend
// state __alpa_base__ (device, queue and the accelerator of the kernel
// currently running) is assumed to be declared by the included cudify headers.
extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

// grid/block dimensions filled in by the launch macros below
extern dim3 blockDim;
extern dim3 gridDim;
static void __syncthreads()
{
    // save the launch context of this thread: synchronizing the block lets the
    // other emulated threads run, and they overwrite these globals
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restore the context
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
}
static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}
static void cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    memcpy(dev_mem,global_cuda_error_array,sz);
}
// CUDA memory copy types (mirrors the CUDA cudaMemcpyKind values)
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,        /* Host   -> Host   */
    cudaMemcpyHostToDevice = 1,      /* Host   -> Device */
    cudaMemcpyDeviceToHost = 2,      /* Device -> Host   */
    cudaMemcpyDeviceToDevice = 3     /* Device -> Device */
};
extern int vct_atomic_add;
extern int vct_atomic_rem;
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, int unused)
{
    memcpy(global_cuda_error_array+offset,mem,sz);
}
namespace cub
{
    // Minimal emulation of the cub BlockScan collective: the "shared" temporary
    // storage is a plain std::array indexed by threadIdx.x
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:
        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {}

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // thread 0 turns the gathered inputs into an exclusive prefix sum
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
}
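/* Illustrative usage sketch (not part of the original header; the kernel and
 * buffer names are hypothetical, and how the temporary storage is shared across
 * the threads of a block depends on how the backend maps __shared__ memory):
 *
 *   void exscan_kernel(int * in, int * out)
 *   {
 *       typedef cub::BlockScan<int,128> BlockScan_t;
 *       BlockScan_t::TempStorage temp;                  // one buffer per block
 *
 *       int value = in[blockIdx.x * 128 + threadIdx.x];
 *       int result;
 *       BlockScan_t(temp).ExclusiveSum(value, result);  // result = sum of the preceding values
 *       out[blockIdx.x * 128 + threadIdx.x] = result;
 *   }
 */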
// Serial emulation of CUDA atomicAdd: returns the value stored before the addition
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}
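/* Illustrative usage sketch (hypothetical variable names): as in CUDA, the
 * wrapper returns the value stored at the address before the addition.
 *
 *   int counter = 5;
 *   int previous = atomicAdd(&counter, 2);   // previous == 5, counter == 7
 */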
#define MGPU_HOST_DEVICE

namespace mgpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };
    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };
    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
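    /* Illustrative usage sketch (hypothetical data): the functors above are
     * plain host-side comparators/operators, so they work with the STL as well
     * as with the mgpu-style routines below.
     *
     *   std::vector<int> v = {3, 1, 2};
     *   std::sort(v.begin(), v.end(), mgpu::less_t<int>());      // ascending: 1 2 3
     *   std::sort(v.begin(), v.end(), mgpu::greater_t<int>());   // descending: 3 2 1
     */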
    // Serial fallback for mgpu segmented reduction: segments[i] is the start
    // offset of segment i, and the last segment runs to the end of the input
    template<typename input_it,
             typename segments_it, typename output_it,
             typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
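    /* Illustrative usage sketch (hypothetical data): reduce each segment of
     * `data` with plus_t. `segments` holds the starting offset of every
     * segment; `init` and `context` are unused by this serial fallback.
     *
     *   int data[7]     = {1, 2, 3, 4, 5, 6, 7};
     *   int segments[3] = {0, 2, 5};            // segments: [0,2) [2,5) [5,7)
     *   int out[3];
     *   int ctx = 0;                            // placeholder context
     *   mgpu::segreduce(data, 7, segments, 3, out, mgpu::plus_t<int>(), 0, ctx);
     *   // out == {3, 12, 13}
     */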
    // Serial fallback for mgpu merge of two sorted key/value ranges
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count && b_it < b_count)
            {
                if (comp(b_keys[b_it],a_keys[a_it]))
                {
                    c_keys[c_it] = b_keys[b_it];
                    c_vals[c_it] = b_vals[b_it];
                    c_it++;
                    b_it++;
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else if (a_it < a_count)
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++;
                a_it++;
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
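/* Illustrative usage sketch (hypothetical data): merge two sorted key/value
 * ranges into one sorted output; `context` is unused by this serial fallback.
 *
 *   int a_k[3] = {1, 4, 6};   int a_v[3] = {10, 40, 60};
 *   int b_k[2] = {2, 5};      int b_v[2] = {20, 50};
 *   int c_k[5];               int c_v[5];
 *   int ctx = 0;              // placeholder context
 *   mgpu::merge(a_k, a_v, 3, b_k, b_v, 2, c_k, c_v, mgpu::less_t<int>(), ctx);
 *   // c_k == {1, 2, 4, 5, 6}, c_v == {10, 20, 40, 50, 60}
 */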
static void init_wrappers()
{
    if (__alpa_base__.initialized == true) {return;}

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}
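/* Illustrative usage sketch: init_wrappers() has to run once before the first
 * launch so that the Alpaka device and queue exist; later calls return
 * immediately because of the `initialized` flag. A typical host-side sequence
 * (the kernel name is hypothetical; see the launch macros defined below):
 *
 *   init_wrappers();
 *   CUDA_LAUNCH_DIM3(my_kernel, 16, 128, dev_ptr, n);
 *   cudaDeviceSynchronize();   // wait for the queue to drain
 */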
// CUDA_LAUNCH / CUDA_LAUNCH_DIM3 map a CUDA-style kernel launch onto an
// alpaka::exec call: they build the work division, fill gridDim/blockDim, set
// threadIdx/blockIdx inside the accelerator lambda, and then invoke the
// "kernel" (a plain function) with the given arguments.
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
\
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
\
        std::cout << "Launching: " << #cuda_call << std::endl;\
\
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
\
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
\
                __alpa_base__.accKer = &acc;\
\
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
\
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
\
        std::cout << "Launching: " << #cuda_call << std::endl;\
\
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
\
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
\
                __alpa_base__.accKer = &acc;\
\
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
\
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
\
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
\
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
\
                __alpa_base__.accKer = &acc;\
\
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
\
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
\
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
\
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
\
                __alpa_base__.accKer = &acc;\
\
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#endif // PRINT_CUDA_LAUNCHES
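/* Illustrative usage sketch (the kernel, its arguments and the grid/block sizes
 * are hypothetical): CUDA_LAUNCH_DIM3 takes explicit grid and block dimensions,
 * while CUDA_LAUNCH takes an iterator-like object exposing `wthr` (blocks) and
 * `thr` (threads). On this backend a "kernel" is just a plain function.
 *
 *   void my_kernel(float * data, int n)
 *   {
 *       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 *       if (i < (unsigned int)n) {data[i] *= 2.0f;}
 *   }
 *
 *   // ...
 *   init_wrappers();
 *   CUDA_LAUNCH_DIM3(my_kernel, 16, 128, dev_ptr, n);   // 16 blocks of 128 threads
 *   cudaDeviceSynchronize();
 */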
#endif /* CUDIFY_ALPAKA_HPP_ */