OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
cudify_alpaka.hpp
#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_

#include "cudify_hardware.hpp"
#include "cuda_util.hpp"
#include "boost/bind.hpp"
#include <type_traits>

#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA

extern alpa_base_structs __alpa_base__;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

static void __syncthreads()
{
    // saving states
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restoring states
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
}

static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}

static void cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    memcpy(dev_mem, global_cuda_error_array, sz);
}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

extern int vct_atomic_add;
extern int vct_atomic_rem;

static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, int unused)
{
    memcpy(global_cuda_error_array + offset, mem, sz);
}

namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {}

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
            return;
        }
    };
}
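
// Usage sketch (illustrative comment, not part of the original header): inside a
// kernel body driven by the CUDA_LAUNCH macros below, this class stands in for the
// CUB primitive of the same name. For a block of 4 emulated threads, each thread
// passing its own value as `in` (temp must be visible to the whole block):
//
//     cub::BlockScan<int, 4>::TempStorage temp;
//     cub::BlockScan<int, 4> scan(temp);
//     int out;
//     scan.ExclusiveSum(in, out);  // in = {3,1,4,1} across threads -> out = {0,3,4,8}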


template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}

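// Note (added comment): the atomicAdd above is a plain read-modify-write with no
// atomicity guarantee; it only mirrors the CUDA convention of returning the old
// value, e.g.
//
//     int counter = 0;
//     int old = atomicAdd(&counter, 5);   // old == 0, counter == 5
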
#define MGPU_HOST_DEVICE

namespace mgpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*  template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*  template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

/*  template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}


namespace mgpu
{
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // Key-value merge.
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}

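// Usage sketch (illustrative comment, not part of the original header): both routines
// above are sequential, host-side stand-ins for the moderngpu primitives of the same
// name (`init` is accepted but not used by this segreduce). With some context object
// `ctx` of the expected context_t type:
//
//     int in[6]  = {1,2,3,4,5,6};
//     int seg[2] = {0,3};                 // segments [0,3) and [3,6)
//     int out[2];
//     mgpu::segreduce(in, 6, seg, 2, out, mgpu::plus_t<int>(), 0, ctx);
//     // out = {6, 15}
//
//     int ak[2] = {1,4}, av[2] = {10,40};
//     int bk[2] = {2,3}, bv[2] = {20,30};
//     int ck[4], cv[4];
//     mgpu::merge(ak, av, 2, bk, bv, 2, ck, cv, mgpu::less_t<int>(), ctx);
//     // ck = {1,2,3,4}, cv = {10,20,30,40}
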
static void init_wrappers()
{
    if (__alpa_base__.initialized == true)  {return;}

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    // Create a queue on the device
    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
                \
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
                \
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
                \
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
                \
                __alpa_base__.accKer = &acc;\
                \
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
                \
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
                \
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
                \
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
                \
                __alpa_base__.accKer = &acc;\
                \
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
        Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
                \
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
                \
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
                \
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
                \
                __alpa_base__.accKer = &acc;\
                \
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
        Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
        Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
        WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        alpaka::exec<Acc_alpa>(\
            *__alpa_base__.queue,\
            workDiv,\
            [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
                \
                auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
                auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
                auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
                \
                blockIdx.x = globalBlockIdx[0];\
                blockIdx.y = globalBlockIdx[1];\
                blockIdx.z = globalBlockIdx[2];\
                \
                threadIdx.x = globalThreadIdx[0];\
                threadIdx.y = globalThreadIdx[1];\
                threadIdx.z = globalThreadIdx[2];\
                \
                __alpa_base__.accKer = &acc;\
                \
                cuda_call(__VA_ARGS__);\
            });\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

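// Usage sketch (illustrative comment, not part of the original header): a typical call
// sequence on this backend, assuming a user-provided kernel functor `my_kernel` that
// takes a single argument `arg`:
//
//     init_wrappers();                                          // create alpaka device and queue
//     CUDA_LAUNCH_DIM3(my_kernel, dim3(4,1,1), dim3(16,1,1), arg);
//     cudaDeviceSynchronize();                                  // wait on the alpaka queue
//
// CUDA_LAUNCH takes instead an iterator-info object `ite` that exposes the `wthr` and
// `thr` members used in the macro bodies above.
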
#endif

#endif