OpenFPM 5.2.0
Project that contains the implementation of distributed structures
cudify_alpaka.hpp
#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_

#include "util/cudify/cudify_hardware_cpu.hpp"
#include "boost/bind.hpp"
#include <type_traits>

#define CUDA_ON_BACKEND CUDA_BACKEND_ALPAKA

extern alpa_base_structs __alpa_base__;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

static void __syncthreads()
{
    // saving states
    dim3 threadIdx_s = threadIdx;
    dim3 blockIdx_s = blockIdx;
    dim3 blockDim_s = blockDim;
    dim3 gridDim_s = gridDim;

    alpaka::syncBlockThreads(*__alpa_base__.accKer);

    // restoring states
    threadIdx = threadIdx_s;
    blockIdx = blockIdx_s;
    blockDim = blockDim_s;
    gridDim = gridDim_s;
};

static void cudaDeviceSynchronize()
{
    alpaka::wait(*__alpa_base__.queue);
}

static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    memcpy(dev_mem,global_cuda_error_array,sz);
}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

extern int vct_atomic_add;
extern int vct_atomic_rem;

static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused)
{
    memcpy(global_cuda_error_array+offset,mem,sz);
}
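
/* Usage sketch: the two symbol-copy shims above reduce to plain host-side memcpy,
   so a value can be round-tripped through a byte buffer directly. The buffer name
   my_error_flags is hypothetical.

    unsigned char my_error_flags[16];

    int code = 1;
    cudaMemcpyToSymbol(my_error_flags,&code,sizeof(int),0,0);   // write at offset 0

    int check = 0;
    cudaMemcpyFromSymbol(&check,my_error_flags,sizeof(int));    // check == 1
*/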

namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
            return;
        }
    };
}
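
/* Usage sketch: this minimal BlockScan is meant to be used like the CUB original inside
   a kernel body. For a block of 4 threads that each contribute the value 1, ExclusiveSum
   yields 0,1,2,3. The storage object must be shared by all threads of the block (in CUDA
   it would carry the __shared__ qualifier); the names below are hypothetical.

    typedef cub::BlockScan<int,4> BlockScanT;

    BlockScanT::TempStorage temp;                // one instance per block, shared by its threads
    int one = 1;
    int prefix;
    BlockScanT(temp).ExclusiveSum(one,prefix);   // prefix == threadIdx.x
*/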

template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
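
/* Usage sketch: like the CUDA intrinsic, this shim returns the value stored at the
   address before the addition.

    int counter = 3;
    int old = atomicAdd(&counter,2);   // old == 3, counter == 5
*/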

namespace gpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }

        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*  template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a > b;
        }

        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*  template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }

        type_t reduceInitValue() const {
            return 0;
        }
    };

/*  template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }

        type_t reduceInitValue() const {
            return std::numeric_limits<type_t>::min();
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }

        type_t reduceInitValue() const {
            return std::numeric_limits<type_t>::max();
        }
    };
}
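
/* Usage sketch: besides acting as binary operators, the arithmetic functors
   (plus_t, maximum_t, minimum_t) expose reduceInitValue(), the identity element
   used to seed a reduction.

    gpu::plus_t<float> add;
    float s = add(add.reduceInitValue(),3.0f);   // 0 + 3.0f == 3.0f

    gpu::maximum_t<int> mx;
    int m = mx(mx.reduceInitValue(),42);         // max(INT_MIN,42) == 42
*/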

namespace gpu
{
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& gpuContext)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
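
    /* Worked sketch: segreduce performs one sequential reduction per segment, where
       segments[] holds the start index of each segment and the last segment runs to
       count. With input = {1,2,3,4,5}, segments = {0,2} and op = plus_t<int>, output
       becomes {1+2, 3+4+5} = {3, 12}. Note that init is not used by this serial
       implementation. The context object name below is hypothetical.

        int input[5]    = {1,2,3,4,5};
        int segments[2] = {0,2};
        int output[2];
        gpu::segreduce(input,5,segments,2,output,gpu::plus_t<int>(),0,gpuContext);
    */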

    // Key-value merge.
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& gpuContext)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
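
/* Worked sketch: merge interleaves two sorted key/value ranges into one sorted output.
   With a_keys = {1,4}, a_vals = {10,40}, b_keys = {2,3}, b_vals = {20,30} and
   less_t<int> as comparator, the result is c_keys = {1,2,3,4}, c_vals = {10,20,30,40}.
   The context object name below is hypothetical.

    int a_keys[2] = {1,4},  a_vals[2] = {10,40};
    int b_keys[2] = {2,3},  b_vals[2] = {20,30};
    int c_keys[4], c_vals[4];
    gpu::merge(a_keys,a_vals,2,b_keys,b_vals,2,c_keys,c_vals,gpu::less_t<int>(),gpuContext);
*/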

static void init_wrappers()
{
    if (__alpa_base__.initialized == true) {return;}

    __alpa_base__.devAcc = new AccType_alpa(alpaka::getDevByIdx<Acc_alpa>(0u));

    // Create a queue on the device
    __alpa_base__.queue = new Queue_alpa(*__alpa_base__.devAcc);

    __alpa_base__.initialized = true;
}
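
/* Note (sketch): init_wrappers() creates the alpaka device and queue that the launch
   macros below dereference, so it is expected to run once before the first launch.

    init_wrappers();
    // ... CUDA_LAUNCH / CUDA_LAUNCH_DIM3 calls ...
    cudaDeviceSynchronize();
*/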

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
        \
        auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
        auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
        auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
        \
        blockIdx.x = globalBlockIdx[0];\
        blockIdx.y = globalBlockIdx[1];\
        blockIdx.z = globalBlockIdx[2];\
        \
        threadIdx.x = globalThreadIdx[0];\
        threadIdx.y = globalThreadIdx[1];\
        threadIdx.z = globalThreadIdx[2];\
        \
        __alpa_base__.accKer = &acc;\
        \
        cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
        \
        auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
        auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
        auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
        \
        blockIdx.x = globalBlockIdx[0];\
        blockIdx.y = globalBlockIdx[1];\
        blockIdx.z = globalBlockIdx[2];\
        \
        threadIdx.x = globalThreadIdx[0];\
        threadIdx.y = globalThreadIdx[1];\
        threadIdx.z = globalThreadIdx[2];\
        \
        __alpa_base__.accKer = &acc;\
        \
        cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
    Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
        \
        auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
        auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
        auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
        \
        blockIdx.x = globalBlockIdx[0];\
        blockIdx.y = globalBlockIdx[1];\
        blockIdx.z = globalBlockIdx[2];\
        \
        threadIdx.x = globalThreadIdx[0];\
        threadIdx.y = globalThreadIdx[1];\
        threadIdx.z = globalThreadIdx[2];\
        \
        __alpa_base__.accKer = &acc;\
        \
        cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
    Vec_alpa const grid_d((Idx_alpa)wthr__.x,(Idx_alpa)wthr__.y,(Idx_alpa)wthr__.z);\
    Vec_alpa const thread_d((Idx_alpa)thr__.x,(Idx_alpa)thr__.y,(Idx_alpa)thr__.z);\
    WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    alpaka::exec<Acc_alpa>(\
        *__alpa_base__.queue,\
        workDiv,\
        [&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
        \
        auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
        auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
        auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
        \
        blockIdx.x = globalBlockIdx[0];\
        blockIdx.y = globalBlockIdx[1];\
        blockIdx.z = globalBlockIdx[2];\
        \
        threadIdx.x = globalThreadIdx[0];\
        threadIdx.y = globalThreadIdx[1];\
        threadIdx.z = globalThreadIdx[2];\
        \
        __alpa_base__.accKer = &acc;\
        \
        cuda_call(__VA_ARGS__);\
        });\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#endif
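
/* Usage sketch: with this backend a kernel is an ordinary host function; the macros set
   up the gridDim/blockDim/blockIdx/threadIdx globals and run it through alpaka::exec.
   The kernel name, buffer and launch configuration below are hypothetical; wthr/thr
   follow the usual CUDA grid/block convention.

    template<typename T>
    void fill_kernel(T * buf, T val)
    {
        unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
        buf[p] = val;
    }

    // ...
    init_wrappers();
    float * buf = new float[4*32];
    CUDA_LAUNCH_DIM3(fill_kernel,4,32,buf,1.0f);   // 4 blocks of 32 threads
    cudaDeviceSynchronize();
*/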

#endif