OpenFPM 5.2.0
Project that contains the implementation of distributed structures
cudify_openmp.hpp
1 #ifndef CUDIFY_OPENMP_HPP_
2 #define CUDIFY_OPENMP_HPP_
3 
4 #include "config.h"
5 
6 constexpr int default_kernel_wg_threads_ = 1024;
7 
8 #include <omp.h>
9 
10 #define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP
11 
12 #include "util/cudify/cudify_hardware_cpu.hpp"
13 
14 #ifdef HAVE_BOOST_CONTEXT
15 
16 #include <boost/bind/bind.hpp>
17 #include <type_traits>
18 #ifdef HAVE_BOOST_CONTEXT
19 #include <boost/context/continuation.hpp>
20 #endif
21 #include <vector>
22 #include <string.h>
23 
24 
25 #ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
26 #define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
27 #endif
28 
29 extern std::vector<void *>mem_stack;
30 
31 extern thread_local dim3 threadIdx;
32 extern thread_local dim3 blockIdx;
33 
34 extern dim3 blockDim;
35 extern dim3 gridDim;
36 
37 extern std::vector<void *> mem_stack;
38 extern std::vector<boost::context::detail::fcontext_t> contexts;
39 extern thread_local void * par_glob;
40 extern thread_local boost::context::detail::fcontext_t main_ctx;
41 
42 static void __syncthreads()
43 {
44  boost::context::detail::jump_fcontext(main_ctx,par_glob);
45 };
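Under this backend __syncthreads() does not block an OS thread: it jumps from the calling fiber back to the scheduling loop in exe_kernel, which resumes the remaining fibers of the same block before re-entering this one. A minimal sketch (not part of the source) of a kernel relying on that behaviour; buf is assumed to point to storage shared by all threads of the block and the data length to be a multiple of the block size:

// Hypothetical kernel: every thread stages its value, the block synchronises,
// then every thread reads its right neighbour's staged value.
static void neighbour_example(int * data, int * buf)
{
    buf[threadIdx.x] = data[blockIdx.x * blockDim.x + threadIdx.x];
    __syncthreads();   // all fibers of the block have written before any fiber reads
    int right = buf[(threadIdx.x + 1) % blockDim.x];
    data[blockIdx.x * blockDim.x + threadIdx.x] += right;
}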
46 
47 
48 extern thread_local int vct_atomic_add;
49 extern thread_local int vct_atomic_rem;
50 
51 
52 namespace cub
53 {
54  template<typename T, unsigned int dim>
55  class BlockScan
56  {
57  public:
58  typedef std::array<T,dim> TempStorage;
59 
60  private:
61  TempStorage & tmp;
62 
63  public:
64 
65 
66 
67  BlockScan(TempStorage & tmp)
68  :tmp(tmp)
69  {};
70 
71  void ExclusiveSum(T & in, T & out)
72  {
73  tmp[threadIdx.x] = in;
74 
75  __syncthreads();
76 
77  if (threadIdx.x == 0)
78  {
79  T prec = tmp[0];
80  tmp[0] = 0;
81  for (int i = 1 ; i < dim ; i++)
82  {
83  auto next = tmp[i-1] + prec;
84  prec = tmp[i];
85  tmp[i] = next;
86  }
87  }
88 
89  __syncthreads();
90 
91  out = tmp[threadIdx.x];
92  return;
93  }
94  };
95 }
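A usage sketch (not part of the source) for the BlockScan shim above, assuming a one-dimensional block of 128 threads and a TempStorage instance visible to all fibers of the block:

// Hypothetical kernel body: exclusive prefix sum over one value per thread.
static void block_scan_example(int * in, int * out, cub::BlockScan<int,128>::TempStorage & temp)
{
    int val = in[blockIdx.x * 128 + threadIdx.x];
    int res;
    cub::BlockScan<int,128>(temp).ExclusiveSum(val, res);  // synchronises the block twice internally
    out[blockIdx.x * 128 + threadIdx.x] = res;
}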
96 
97 template<typename T, typename T2>
98 static T atomicAdd(T * address, T2 val)
99 {
100  return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
101 };
102 
103 template<typename T, typename T2>
104 static T atomicAddShared(T * address, T2 val)
105 {
106  T old = *address;
107  *address += val;
108  return old;
109 }
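atomicAdd maps to a relaxed __atomic_fetch_add and is therefore safe across OpenMP workers, while atomicAddShared is a plain read-modify-write, which suffices for block-local counters because all fibers of one block run on the same OpenMP thread. Both return the previous value, as in CUDA. A hedged sketch with a hypothetical kernel:

// Count how many input values are positive, accumulating into one global counter.
static void count_positive_example(const float * v, int n, int * counter)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && v[i] > 0.0f)
    {atomicAdd(counter, 1);}   // the returned old value is discarded here
}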
110 
111 namespace gpu
112 {
113  template<typename type_t>
114  struct less_t : public std::binary_function<type_t, type_t, bool> {
115  bool operator()(type_t a, type_t b) const {
116  return a < b;
117  }
118 
119  template<typename type2_t, typename type3_t>
120  bool operator()(type2_t a, type3_t b) const {
121  return a < b;
122  }
123  };
124 /* template<typename type_t>
125  struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
126  bool operator()(type_t a, type_t b) const {
127  return a <= b;
128  }
129  };*/
130  template<typename type_t>
131  struct greater_t : public std::binary_function<type_t, type_t, bool> {
132  bool operator()(type_t a, type_t b) const {
133  return a > b;
134  }
135 
136  template<typename type2_t, typename type3_t>
137  bool operator()(type2_t a, type3_t b) const {
138  return a > b;
139  }
140  };
141 /* template<typename type_t>
142  struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
143  bool operator()(type_t a, type_t b) const {
144  return a >= b;
145  }
146  };
147  template<typename type_t>
148  struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
149  bool operator()(type_t a, type_t b) const {
150  return a == b;
151  }
152  };
153  template<typename type_t>
154  struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
155  bool operator()(type_t a, type_t b) const {
156  return a != b;
157  }
158  };*/
159 
161  // Device-side arithmetic operators.
162 
163  template<typename type_t>
164  struct plus_t : public std::binary_function<type_t, type_t, type_t> {
165  type_t operator()(type_t a, type_t b) const {
166  return a + b;
167  }
168 
169  type_t reduceInitValue() const {
170  return 0;
171  }
172  };
173 
174 /* template<typename type_t>
175  struct minus_t : public std::binary_function<type_t, type_t, type_t> {
176  type_t operator()(type_t a, type_t b) const {
177  return a - b;
178  }
179  };
180 
181  template<typename type_t>
182  struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
183  type_t operator()(type_t a, type_t b) const {
184  return a * b;
185  }
186  };*/
187 
188  template<typename type_t>
189  struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
190  type_t operator()(type_t a, type_t b) const {
191  return std::max(a, b);
192  }
193 
194  type_t reduceInitValue() const {
195  return std::numeric_limits<type_t>::min();
196  }
197  };
198 
199  template<typename type_t>
200  struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
201  type_t operator()(type_t a, type_t b) const {
202  return std::min(a, b);
203  }
204 
205  type_t reduceInitValue() const {
206  return std::numeric_limits<type_t>::max();
207  }
208  };
209 }
210 
211 
212 namespace gpu
213 {
214  template<typename input_it,
215  typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
216  void segreduce(input_it input, int count, segments_it segments,
217  int num_segments, output_it output, op_t op, type_t init,
218  context_t& gpuContext)
219  {
220  int i = 0;
221  for ( ; i < num_segments - 1; i++)
222  {
223  int j = segments[i];
224  output[i] = input[j];
225  ++j;
226  for ( ; j < segments[i+1] ; j++)
227  {
228  output[i] = op(output[i],input[j]);
229  }
230  }
231 
232  // Last segment
233  int j = segments[i];
234  output[i] = input[j];
235  ++j;
236  for ( ; j < count ; j++)
237  {
238  output[i] = op(output[i],input[j]);
239  }
240  }
241 
242  // Key-value merge.
243  template<typename a_keys_it, typename a_vals_it,
244  typename b_keys_it, typename b_vals_it,
245  typename c_keys_it, typename c_vals_it,
246  typename comp_t, typename context_t>
247  void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
248  b_keys_it b_keys, b_vals_it b_vals, int b_count,
249  c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& gpuContext)
250  {
251  int a_it = 0;
252  int b_it = 0;
253  int c_it = 0;
254 
255  while (a_it < a_count || b_it < b_count)
256  {
257  if (a_it < a_count)
258  {
259  if (b_it < b_count)
260  {
261  if (comp(b_keys[b_it],a_keys[a_it]))
262  {
263  c_keys[c_it] = b_keys[b_it];
264  c_vals[c_it] = b_vals[b_it];
265  c_it++;
266  b_it++;
267  }
268  else
269  {
270  c_keys[c_it] = a_keys[a_it];
271  c_vals[c_it] = a_vals[a_it];
272  c_it++;
273  a_it++;
274  }
275  }
276  else
277  {
278  c_keys[c_it] = a_keys[a_it];
279  c_vals[c_it] = a_vals[a_it];
280  c_it++;
281  a_it++;
282  }
283  }
284  else
285  {
286  c_keys[c_it] = b_keys[b_it];
287  c_vals[c_it] = b_vals[b_it];
288  c_it++;
289  b_it++;
290  }
291  }
292  }
293 }
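Both routines above are plain serial loops executed by the calling thread; the context argument is only forwarded, never dereferenced. A hedged usage sketch for segreduce, where context_t stands for whatever gpu context type the caller already holds:

// Reduce two segments with plus_t; values and segment offsets are illustrative.
template<typename context_t>
static void segreduce_example(context_t & ctx)
{
    int data[6]     = {1, 2, 3, 4, 5, 6};
    int segments[2] = {0, 3};   // segment 0 = [0,3), segment 1 = [3,6)
    int result[2];
    gpu::segreduce(data, 6, segments, 2, result, gpu::plus_t<int>(), 0, ctx);
    // result[0] == 6, result[1] == 15; the init argument is accepted but not applied
}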
294 
295 extern size_t n_workers;
296 
297 extern bool init_wrappers_call;
298 
299 extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
300 extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
301 extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];
302 
303 static void init_wrappers()
304 {
305  init_wrappers_call = true;
306 
307  #pragma omp parallel
308  {
309  n_workers = omp_get_num_threads();
310  }
311 
312  #pragma omp parallel for
313  for (int s = 0 ; s < n_workers ; s++)
314  {
315  unsigned int tid = omp_get_thread_num();
316  tid_x[tid] = &threadIdx.x;
317  tid_y[tid] = &threadIdx.y;
318  tid_z[tid] = &threadIdx.z;
319 
320  }
321 }
322 
323 
324 template<typename lambda_f>
325 struct Fun_enc
326 {
327  lambda_f Fn;
328 
329  Fun_enc(lambda_f Fn)
330  :Fn(Fn)
331  {}
332 
333  void run()
334  {
335  Fn();
336  }
337 };
338 
339 template<typename lambda_f>
340 struct Fun_enc_bt
341 {
342  lambda_f Fn;
343  dim3 & blockIdx;
344  dim3 & threadIdx;
345 
346  Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
347  :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
348  {}
349 
350  void run()
351  {
352  Fn(blockIdx,threadIdx);
353  }
354 };
355 
356 template<typename Fun_enc_type>
357 void launch_kernel(boost::context::detail::transfer_t par)
358 {
359  main_ctx = par.fctx;
360  par_glob = par.data;
361  Fun_enc_type * ptr = (Fun_enc_type *)par.data;
362 
363  ptr->run();
364 
365  boost::context::detail::jump_fcontext(par.fctx,0);
366 }
367 
368 
369 template<typename lambda_f, typename ite_type>
370 static void exe_kernel(lambda_f f, const ite_type & ite)
371 {
372 #ifdef SE_CLASS1
373  if (init_wrappers_call == false)
374  {
375  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
376  }
377 #endif
378 
379  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
380 
381  if (mem_stack.size() < ite.nthrs()*n_workers)
382  {
383  int old_size = mem_stack.size();
384  mem_stack.resize(ite.nthrs()*n_workers);
385 
386  for (int i = old_size ; i < mem_stack.size() ; i++)
387  {
388  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
389  }
390  }
391 
392  size_t stride = ite.nthrs();
393 
394  // Resize contexts
395  contexts.resize(mem_stack.size());
396 
397  Fun_enc<lambda_f> fe(f);
398  bool is_sync_free = true;
399 
400  bool first_block = true;
401 
402  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
403  for (int i = 0 ; i < ite.wthr.z ; i++)
404  {
405  for (int j = 0 ; j < ite.wthr.y ; j++)
406  {
407  for (int k = 0 ; k < ite.wthr.x ; k++)
408  {
409  size_t tid = omp_get_thread_num();
410 
411  if (first_block == true || is_sync_free == false)
412  {
413  blockIdx.z = i;
414  blockIdx.y = j;
415  blockIdx.x = k;
416  int nc = 0;
417  for (int it = 0 ; it < ite.thr.z ; it++)
418  {
419  for (int jt = 0 ; jt < ite.thr.y ; jt++)
420  {
421  for (int kt = 0 ; kt < ite.thr.x ; kt++)
422  {
423  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
424 
425  nc++;
426  }
427  }
428  }
429 
430  bool work_to_do = true;
431  while(work_to_do)
432  {
433  nc = 0;
434  // Work threads
435  for (int it = 0 ; it < ite.thr.z ; it++)
436  {
437  *tid_z[tid] = it;
438  for (int jt = 0 ; jt < ite.thr.y ; jt++)
439  {
440  *tid_y[tid] = jt;
441  for (int kt = 0 ; kt < ite.thr.x ; kt++)
442  {
443  *tid_x[tid] = kt;
444  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
445  contexts[nc + tid*stride] = t.fctx;
446 
447  work_to_do &= (t.data != 0);
448  is_sync_free &= !(work_to_do);
449  nc++;
450  }
451  }
452  }
453  }
454  }
455  else
456  {
457  blockIdx.z = i;
458  blockIdx.y = j;
459  blockIdx.x = k;
460  int fb = 0;
461  // Work threads
462  for (int it = 0 ; it < ite.thr.z ; it++)
463  {
464  *tid_z[tid] = it;
465  for (int jt = 0 ; jt < ite.thr.y ; jt++)
466  {
467  *tid_y[tid] = jt;
468  for (int kt = 0 ; kt < ite.thr.x ; kt++)
469  {
470  *tid_x[tid] = kt;
471  f();
472  }
473  }
474  }
475  }
476 
477  first_block = false;
478  }
479  }
480  }
481 }
482 
483 template<typename lambda_f, typename ite_type>
484 static void exe_kernel_lambda(lambda_f f, ite_type & ite)
485 {
486 #ifdef SE_CLASS1
487  if (init_wrappers_call == false)
488  {
489  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
490  }
491 #endif
492 
493  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
494 
495  if (mem_stack.size() < ite.nthrs()*n_workers)
496  {
497  int old_size = mem_stack.size();
498  mem_stack.resize(ite.nthrs()*n_workers);
499 
500  for (int i = old_size ; i < mem_stack.size() ; i++)
501  {
502  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
503  }
504  }
505 
506  size_t stride = ite.nthrs();
507 
508  // Resize contexts
509  contexts.resize(mem_stack.size());
510 
511  bool is_sync_free = true;
512 
513  bool first_block = true;
514 
515  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
516  for (int i = 0 ; i < ite.wthr.z ; i++)
517  {
518  for (int j = 0 ; j < ite.wthr.y ; j++)
519  {
520  for (int k = 0 ; k < ite.wthr.x ; k++)
521  {
522  dim3 blockIdx;
523  dim3 threadIdx;
524  Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);
525  if (first_block == true || is_sync_free == false)
526  {
527  size_t tid = omp_get_thread_num();
528 
529  blockIdx.z = i;
530  blockIdx.y = j;
531  blockIdx.x = k;
532  int nc = 0;
533  for (int it = 0 ; it < ite.thr.z ; it++)
534  {
535  for (int jt = 0 ; jt < ite.thr.y ; jt++)
536  {
537  for (int kt = 0 ; kt < ite.thr.x ; kt++)
538  {
539  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
540 
541  nc++;
542  }
543  }
544  }
545 
546  bool work_to_do = true;
547  while(work_to_do)
548  {
549  nc = 0;
550  // Work threads
551  for (int it = 0 ; it < ite.thr.z ; it++)
552  {
553  threadIdx.z = it;
554  for (int jt = 0 ; jt < ite.thr.y ; jt++)
555  {
556  threadIdx.y = jt;
557  for (int kt = 0 ; kt < ite.thr.x ; kt++)
558  {
559  threadIdx.x = kt;
560  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
561  contexts[nc + tid*stride] = t.fctx;
562 
563  work_to_do &= (t.data != 0);
564  is_sync_free &= !(work_to_do);
565  nc++;
566  }
567  }
568  }
569  }
570  }
571  else
572  {
573  blockIdx.z = i;
574  blockIdx.y = j;
575  blockIdx.x = k;
576  int fb = 0;
577  // Work threads
578  for (int it = 0 ; it < ite.thr.z ; it++)
579  {
580  threadIdx.z = it;
581  for (int jt = 0 ; jt < ite.thr.y ; jt++)
582  {
583  threadIdx.y = jt;
584  for (int kt = 0 ; kt < ite.thr.x ; kt++)
585  {
586  threadIdx.x = kt;
587  f(blockIdx,threadIdx);
588  }
589  }
590  }
591  }
592 
593  first_block = false;
594  }
595  }
596  }
597 }
598 
599 template<typename lambda_f, typename ite_type>
600 static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
601 {
602 #ifdef SE_CLASS1
603  if (init_wrappers_call == false)
604  {
605  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
606  }
607 #endif
608 
609  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
610 
611  if (mem_stack.size() < ite.nthrs()*n_workers)
612  {
613  int old_size = mem_stack.size();
614  mem_stack.resize(ite.nthrs()*n_workers);
615 
616  for (int i = old_size ; i < mem_stack.size() ; i++)
617  {
618  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
619  }
620  }
621 
622  size_t stride = ite.nthrs();
623 
624  // Resize contexts
625  contexts.resize(mem_stack.size());
626 
627  bool is_sync_free = true;
628 
629  bool first_block = true;
630 
631  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
632  for (int i = 0 ; i < ite.wthr.z ; i++)
633  {
634  for (int j = 0 ; j < ite.wthr.y ; j++)
635  {
636  for (int k = 0 ; k < ite.wthr.x ; k++)
637  {
638  Fun_enc<lambda_f> fe(f);
639  if (first_block == true || is_sync_free == false)
640  {
641  size_t tid = omp_get_thread_num();
642 
643  blockIdx.z = i;
644  blockIdx.y = j;
645  blockIdx.x = k;
646  int nc = 0;
647  for (int it = 0 ; it < ite.thr.z ; it++)
648  {
649  for (int jt = 0 ; jt < ite.thr.y ; jt++)
650  {
651  for (int kt = 0 ; kt < ite.thr.x ; kt++)
652  {
653  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
654 
655  nc++;
656  }
657  }
658  }
659 
660  bool work_to_do = true;
661  while(work_to_do)
662  {
663  nc = 0;
664  // Work threads
665  for (int it = 0 ; it < ite.thr.z ; it++)
666  {
667  threadIdx.z = it;
668  for (int jt = 0 ; jt < ite.thr.y ; jt++)
669  {
670  threadIdx.y = jt;
671  for (int kt = 0 ; kt < ite.thr.x ; kt++)
672  {
673  threadIdx.x = kt;
674  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
675  contexts[nc + tid*stride] = t.fctx;
676 
677  work_to_do &= (t.data != 0);
678  is_sync_free &= !(work_to_do);
679  nc++;
680  }
681  }
682  }
683  }
684  }
685  else
686  {
687  blockIdx.z = i;
688  blockIdx.y = j;
689  blockIdx.x = k;
690  int fb = 0;
691  // Work threads
692  for (int it = 0 ; it < ite.thr.z ; it++)
693  {
694  threadIdx.z = it;
695  for (int jt = 0 ; jt < ite.thr.y ; jt++)
696  {
697  threadIdx.y = jt;
698  for (int kt = 0 ; kt < ite.thr.x ; kt++)
699  {
700  threadIdx.x = kt;
701  f();
702  }
703  }
704  }
705  }
706 
707  first_block = false;
708  }
709  }
710  }
711 }
712 
713 template<typename lambda_f, typename ite_type>
714 static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
715 {
716  #pragma omp parallel for collapse(3)
717  for (int i = 0 ; i < ite.wthr.z ; i++)
718  {
719  for (int j = 0 ; j < ite.wthr.y ; j++)
720  {
721  for (int k = 0 ; k < ite.wthr.x ; k++)
722  {
723  blockIdx.z = i;
724  blockIdx.y = j;
725  blockIdx.x = k;
726  int fb = 0;
727  // Work threads
728  for (int it = 0 ; it < ite.thr.z ; it++)
729  {
730  threadIdx.z = it;
731  for (int jt = 0 ; jt < ite.thr.y ; jt++)
732  {
733  threadIdx.y = jt;
734  for (int kt = 0 ; kt < ite.thr.x ; kt++)
735  {
736  threadIdx.x = kt;
737  f();
738  }
739  }
740  }
741  }
742  }
743  }
744 }
745 
746 #ifdef PRINT_CUDA_LAUNCHES
747 
748 #define CUDA_LAUNCH(cuda_call,ite, ...) \
749  {\
750  gridDim.x = ite.wthr.x;\
751  gridDim.y = ite.wthr.y;\
752  gridDim.z = ite.wthr.z;\
753  \
754  blockDim.x = ite.thr.x;\
755  blockDim.y = ite.thr.y;\
756  blockDim.z = ite.thr.z;\
757  \
758  CHECK_SE_CLASS1_PRE\
759  \
760  std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
761  \
762  exe_kernel([&]() -> void {\
763  \
764  \
765  cuda_call(__VA_ARGS__);\
766  \
767  },ite);\
768  \
769  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
770  }
771 
772 
773 #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
774  {\
775  dim3 wthr__(wthr_);\
776  dim3 thr__(thr_);\
777  \
778  ite_gpu<1> itg;\
779  itg.wthr = wthr_;\
780  itg.thr = thr_;\
781  \
782  gridDim.x = wthr__.x;\
783  gridDim.y = wthr__.y;\
784  gridDim.z = wthr__.z;\
785  \
786  blockDim.x = thr__.x;\
787  blockDim.y = thr__.y;\
788  blockDim.z = thr__.z;\
789  \
790  CHECK_SE_CLASS1_PRE\
791  std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
792  \
793  exe_kernel([&]() -> void {\
794  \
795  cuda_call(__VA_ARGS__);\
796  \
797  },itg);\
798  \
799  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
800  }
801 
802 
803 #define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
804  {\
805  dim3 wthr__(wthr_);\
806  dim3 thr__(thr_);\
807  \
808  ite_gpu<1> itg;\
809  itg.wthr = wthr_;\
810  itg.thr = thr_;\
811  \
812  gridDim.x = wthr__.x;\
813  gridDim.y = wthr__.y;\
814  gridDim.z = wthr__.z;\
815  \
816  blockDim.x = thr__.x;\
817  blockDim.y = thr__.y;\
818  blockDim.z = thr__.z;\
819  \
820  CHECK_SE_CLASS1_PRE\
821  std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
822  \
823  exe_kernel([&]() -> void {\
824  \
825  cuda_call(__VA_ARGS__);\
826  \
827  },itg);\
828  \
829  }
830 
831 #define CUDA_CHECK()
832 
833 #else
834 
835 
836 #define CUDA_LAUNCH(cuda_call,ite, ...) \
837  {\
838  gridDim.x = ite.wthr.x;\
839  gridDim.y = ite.wthr.y;\
840  gridDim.z = ite.wthr.z;\
841  \
842  blockDim.x = ite.thr.x;\
843  blockDim.y = ite.thr.y;\
844  blockDim.z = ite.thr.z;\
845  \
846  CHECK_SE_CLASS1_PRE\
847  \
848  exe_kernel([&]() -> void {\
849  \
850  \
851  cuda_call(__VA_ARGS__);\
852  \
853  },ite);\
854  \
855  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
856  }
857 
858 #define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
859  {\
860  gridDim.x = ite.wthr.x;\
861  gridDim.y = ite.wthr.y;\
862  gridDim.z = ite.wthr.z;\
863  \
864  blockDim.x = ite.thr.x;\
865  blockDim.y = ite.thr.y;\
866  blockDim.z = ite.thr.z;\
867  \
868  CHECK_SE_CLASS1_PRE\
869  \
870  exe_kernel_lambda(lambda_f,ite);\
871  \
872  CHECK_SE_CLASS1_POST("lambda",0)\
873  }
874 
875 #define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
876  {\
877  gridDim.x = ite.wthr.x;\
878  gridDim.y = ite.wthr.y;\
879  gridDim.z = ite.wthr.z;\
880  \
881  blockDim.x = ite.thr.x;\
882  blockDim.y = ite.thr.y;\
883  blockDim.z = ite.thr.z;\
884  \
885  CHECK_SE_CLASS1_PRE\
886  \
887  exe_kernel_lambda_tls(lambda_f,ite);\
888  \
889  CHECK_SE_CLASS1_POST("lambda",0)\
890  }
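A hedged usage sketch for the TLS lambda launcher above: the lambda takes no arguments and reads the thread-local threadIdx/blockIdx directly. ite is an already configured ite_gpu<1>; the function and array names are hypothetical:

static void add_one_example(float * in, float * out, int n, ite_gpu<1> & ite)
{
    CUDA_LAUNCH_LAMBDA_TLS(ite,
        [=]() {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < n) {out[i] = in[i] + 1.0f;}
        });
}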
891 
892 #define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
893  {\
894  dim3 wthr__(wthr_);\
895  dim3 thr__(thr_);\
896  \
897  ite_gpu<1> itg;\
898  itg.wthr = wthr_;\
899  itg.thr = thr_;\
900  gridDim.x = itg.wthr.x;\
901  gridDim.y = itg.wthr.y;\
902  gridDim.z = itg.wthr.z;\
903  \
904  blockDim.x = itg.thr.x;\
905  blockDim.y = itg.thr.y;\
906  blockDim.z = itg.thr.z;\
907  \
908  CHECK_SE_CLASS1_PRE\
909  \
910  exe_kernel_lambda_tls(lambda_f,itg);\
911  \
912  CHECK_SE_CLASS1_POST("lambda",0)\
913  }
914 
915 #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
916  {\
917  dim3 wthr__(wthr_);\
918  dim3 thr__(thr_);\
919  \
920  ite_gpu<1> itg;\
921  itg.wthr = wthr_;\
922  itg.thr = thr_;\
923  \
924  gridDim.x = wthr__.x;\
925  gridDim.y = wthr__.y;\
926  gridDim.z = wthr__.z;\
927  \
928  blockDim.x = thr__.x;\
929  blockDim.y = thr__.y;\
930  blockDim.z = thr__.z;\
931  \
932  CHECK_SE_CLASS1_PRE\
933  \
934  \
935  exe_kernel([&]() -> void {\
936  \
937  cuda_call(__VA_ARGS__);\
938  \
939  },itg);\
940  \
941  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
942  }
943 
944 #define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
945  {\
946  dim3 wthr__(wthr_);\
947  dim3 thr__(thr_);\
948  \
949  ite_gpu<1> itg;\
950  itg.wthr = wthr_;\
951  itg.thr = thr_;\
952  \
953  gridDim.x = wthr__.x;\
954  gridDim.y = wthr__.y;\
955  gridDim.z = wthr__.z;\
956  \
957  blockDim.x = thr__.x;\
958  blockDim.y = thr__.y;\
959  blockDim.z = thr__.z;\
960  \
961  CHECK_SE_CLASS1_PRE\
962  \
963  \
964  exe_kernel([&]() -> void {\
965  \
966  cuda_call(__VA_ARGS__);\
967  \
968  },itg);\
969  \
970  }
971 
972 #define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
973  {\
974  gridDim.x = ite.wthr.x;\
975  gridDim.y = ite.wthr.y;\
976  gridDim.z = ite.wthr.z;\
977  \
978  blockDim.x = ite.thr.x;\
979  blockDim.y = ite.thr.y;\
980  blockDim.z = ite.thr.z;\
981  \
982  CHECK_SE_CLASS1_PRE\
983  \
984  exe_kernel_no_sync([&]() -> void {\
985  \
986  \
987  cuda_call(__VA_ARGS__);\
988  \
989  },ite);\
990  \
991  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
992  }
993 
994 
995 #define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
996  {\
997  dim3 wthr__(wthr_);\
998  dim3 thr__(thr_);\
999  \
1000  ite_gpu<1> itg;\
1001  itg.wthr = wthr_;\
1002  itg.thr = thr_;\
1003  \
1004  gridDim.x = wthr__.x;\
1005  gridDim.y = wthr__.y;\
1006  gridDim.z = wthr__.z;\
1007  \
1008  blockDim.x = thr__.x;\
1009  blockDim.y = thr__.y;\
1010  blockDim.z = thr__.z;\
1011  \
1012  CHECK_SE_CLASS1_PRE\
1013  \
1014  exe_kernel_no_sync([&]() -> void {\
1015  \
1016  cuda_call(__VA_ARGS__);\
1017  \
1018  },itg);\
1019  \
1020  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
1021  }
1022 
1023 #define CUDA_CHECK()
1024 
1025 #endif
1026 
1027 #endif
1028 
1029 #endif
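Taken together, a hedged end-to-end sketch of how this backend is typically driven. The kernel, sizes, and helper below are hypothetical; the launch configuration is filled by hand here, while OpenFPM containers normally supply it through their GPU iterator helpers:

// Hypothetical kernel: scale a vector in place.
static void scale_kernel(float * v, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) {return;}
    v[i] = v[i] * s;
}

static void run_scale(float * v, int n)
{
    init_wrappers();   // required once before the first launch on this backend

    ite_gpu<1> ite;
    ite.wthr.x = (n + 127) / 128; ite.wthr.y = 1; ite.wthr.z = 1;
    ite.thr.x  = 128;             ite.thr.y  = 1; ite.thr.z  = 1;

    CUDA_LAUNCH(scale_kernel, ite, v, n, 2.0f);
}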