OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
cudify_openmp.hpp
1 #ifndef CUDIFY_OPENMP_HPP_
2 #define CUDIFY_OPENMP_HPP_
3 
4 
5 
6 #include "config.h"
7 
8 constexpr int default_kernel_wg_threads_ = 1024;
9 
10 #include <omp.h>
11 
12 
13 #define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP
14 
15 #include "cudify_hardware_common.hpp"
16 
17 #ifdef HAVE_BOOST_CONTEXT
18 
19 #include <boost/bind/bind.hpp>
20 #include <type_traits>
21 #ifdef HAVE_BOOST_CONTEXT
22 #include <boost/context/continuation.hpp>
23 #endif
24 #include <vector>
25 #include <string.h>
26 
27 
28 #ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
29 #define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
30 #endif
31 
32 extern std::vector<void *>mem_stack;
33 
34 extern thread_local dim3 threadIdx;
35 extern thread_local dim3 blockIdx;
36 
37 extern dim3 blockDim;
38 extern dim3 gridDim;
39 
40 extern std::vector<void *> mem_stack;
41 extern std::vector<boost::context::detail::fcontext_t> contexts;
42 extern thread_local void * par_glob;
43 extern thread_local boost::context::detail::fcontext_t main_ctx;
44 
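// __syncthreads() below suspends the calling fiber and jumps back to the block
// scheduler in exe_kernel*(). The scheduler resumes the remaining fibers of the
// block up to the same barrier before this fiber continues, so the call acts as
// a block-wide barrier provided all emulated threads reach the same number of
// barriers (the usual CUDA requirement).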
45 static void __syncthreads()
46 {
47  boost::context::detail::jump_fcontext(main_ctx,par_glob);
48 };
49 
50 
51 extern thread_local int vct_atomic_add;
52 extern thread_local int vct_atomic_rem;
53 
54 
55 namespace cub
56 {
57  template<typename T, unsigned int dim>
58  class BlockScan
59  {
60  public:
61  typedef std::array<T,dim> TempStorage;
62 
63  private:
64  TempStorage & tmp;
65 
66  public:
67 
68 
69 
70  BlockScan(TempStorage & tmp)
71  :tmp(tmp)
72  {};
73 
74  void ExclusiveSum(T & in, T & out)
75  {
76  tmp[threadIdx.x] = in;
77 
78  __syncthreads();
79 
80  if (threadIdx.x == 0)
81  {
82  T prec = tmp[0];
83  tmp[0] = 0;
84  for (int i = 1 ; i < dim ; i++)
85  {
86  auto next = tmp[i-1] + prec;
87  prec = tmp[i];
88  tmp[i] = next;
89  }
90  }
91 
92  __syncthreads();
93 
94  out = tmp[threadIdx.x];
95  return;
96  }
97  };
98 }
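The class above emulates the subset of cub::BlockScan used by OpenFPM kernels. A minimal sketch of how it is typically exercised inside a kernel body on this backend; the kernel name, the block size and the __shared__ mapping (provided elsewhere by the cudify headers) are assumptions:

template<unsigned int blockSize>
void block_exscan_example(int * in, int * out)
{
    typedef cub::BlockScan<int, blockSize> BlockScan;

    // Temporary storage shared by all emulated threads of the block
    // (__shared__ is assumed to be mapped by the other cudify headers).
    __shared__ typename BlockScan::TempStorage temp;

    int val  = in[blockIdx.x * blockSize + threadIdx.x];
    int scan = 0;

    // ExclusiveSum() calls __syncthreads() internally, so a kernel using it
    // must be launched through CUDA_LAUNCH*/exe_kernel (fiber path), not the
    // *_NOSYNC variants.
    BlockScan(temp).ExclusiveSum(val, scan);

    out[blockIdx.x * blockSize + threadIdx.x] = scan;
}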
99 
100 template<typename T, typename T2>
101 static T atomicAdd(T * address, T2 val)
102 {
103  return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
104 };
105 
106 template<typename T, typename T2>
107 static T atomicAddShared(T * address, T2 val)
108 {
109  T old = *address;
110  *address += val;
111  return old;
112 }
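atomicAdd() maps to the compiler's __atomic_fetch_add and is therefore safe when several OpenMP workers update the same address; atomicAddShared() is the plain, non-atomic variant meant for data touched by a single block only. A minimal sketch of kernel-side usage (the counter pointer is a placeholder):

void count_positive_example(const float * v, int n, int * counter)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && v[i] > 0.0f)
    {
        // returns the value stored at *counter before the increment
        int before = atomicAdd(counter, 1);
        (void)before;
    }
}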
113 
114 #define MGPU_HOST_DEVICE
115 
116 namespace mgpu
117 {
118  template<typename type_t>
119  struct less_t : public std::binary_function<type_t, type_t, bool> {
120  bool operator()(type_t a, type_t b) const {
121  return a < b;
122  }
123  template<typename type2_t, typename type3_t>
124  bool operator()(type2_t a, type3_t b) const {
125  return a < b;
126  }
127  };
128 /* template<typename type_t>
129  struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
130  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
131  return a <= b;
132  }
133  };*/
134  template<typename type_t>
135  struct greater_t : public std::binary_function<type_t, type_t, bool> {
136  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
137  return a > b;
138  }
139  template<typename type2_t, typename type3_t>
140  MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
141  return a > b;
142  }
143  };
144 /* template<typename type_t>
145  struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
146  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
147  return a >= b;
148  }
149  };
150  template<typename type_t>
151  struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
152  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
153  return a == b;
154  }
155  };
156  template<typename type_t>
157  struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
158  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
159  return a != b;
160  }
161  };*/
162 
164  // Device-side arithmetic operators.

165 
166  template<typename type_t>
167  struct plus_t : public std::binary_function<type_t, type_t, type_t> {
168  type_t operator()(type_t a, type_t b) const {
169  return a + b;
170  }
171  };
172 
173 /* template<typename type_t>
174  struct minus_t : public std::binary_function<type_t, type_t, type_t> {
175  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
176  return a - b;
177  }
178  };
179 
180  template<typename type_t>
181  struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
182  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
183  return a * b;
184  }
185  };*/
186 
187  template<typename type_t>
188  struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
189  type_t operator()(type_t a, type_t b) const {
190  return std::max(a, b);
191  }
192  };
193 
194  template<typename type_t>
195  struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
196  type_t operator()(type_t a, type_t b) const {
197  return std::min(a, b);
198  }
199  };
200 }
201 
202 
203 namespace mgpu
204 {
205  template<typename input_it,
206  typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
207  void segreduce(input_it input, int count, segments_it segments,
208  int num_segments, output_it output, op_t op, type_t init,
209  context_t& context)
210  {
211  int i = 0;
212  for ( ; i < num_segments - 1; i++)
213  {
214  int j = segments[i];
215  output[i] = input[j];
216  ++j;
217  for ( ; j < segments[i+1] ; j++)
218  {
219  output[i] = op(output[i],input[j]);
220  }
221  }
222 
223  // Last segment
224  int j = segments[i];
225  output[i] = input[j];
226  ++j;
227  for ( ; j < count ; j++)
228  {
229  output[i] = op(output[i],input[j]);
230  }
231  }
232 
233  // Key-value merge.
234  template<typename a_keys_it, typename a_vals_it,
235  typename b_keys_it, typename b_vals_it,
236  typename c_keys_it, typename c_vals_it,
237  typename comp_t, typename context_t>
238  void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
239  b_keys_it b_keys, b_vals_it b_vals, int b_count,
240  c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
241  {
242  int a_it = 0;
243  int b_it = 0;
244  int c_it = 0;
245 
246  while (a_it < a_count || b_it < b_count)
247  {
248  if (a_it < a_count)
249  {
250  if (b_it < b_count)
251  {
252  if (comp(b_keys[b_it],a_keys[a_it]))
253  {
254  c_keys[c_it] = b_keys[b_it];
255  c_vals[c_it] = b_vals[b_it];
256  c_it++;
257  b_it++;
258  }
259  else
260  {
261  c_keys[c_it] = a_keys[a_it];
262  c_vals[c_it] = a_vals[a_it];
263  c_it++;
264  a_it++;
265  }
266  }
267  else
268  {
269  c_keys[c_it] = a_keys[a_it];
270  c_vals[c_it] = a_vals[a_it];
271  c_it++;
272  a_it++;
273  }
274  }
275  else
276  {
277  c_keys[c_it] = b_keys[b_it];
278  c_vals[c_it] = b_vals[b_it];
279  c_it++;
280  b_it++;
281  }
282  }
283  }
284 }
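Both routines above are straight serial fallbacks: segreduce() folds each segment [segments[i], segments[i+1]) of input with op (note that the init argument is not used by this implementation), and merge() performs a standard two-way key-value merge. A host-side sketch of segreduce; the context argument is never touched here, so a placeholder value suffices:

#include <vector>

void segreduce_example()
{
    std::vector<float> input    = {1, 2, 3,   4, 5,   6};
    std::vector<int>   segments = {0, 3, 5};          // segments [0,3) [3,5) [5,6)
    std::vector<float> output(segments.size());

    int dummy_context = 0;   // unused by this serial implementation
    mgpu::segreduce(input.data(), (int)input.size(),
                    segments.data(), (int)segments.size(),
                    output.data(), mgpu::plus_t<float>(), 0.0f, dummy_context);

    // output == {6, 9, 6}
}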
285 
286 extern size_t n_workers;
287 
288 extern bool init_wrappers_call;
289 
290 extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
291 extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
292 extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];
293 
294 static void init_wrappers()
295 {
296  init_wrappers_call = true;
297 
298  #pragma omp parallel
299  {
300  n_workers = omp_get_num_threads();
301  }
302 
303  #pragma omp parallel for
304  for (int s = 0 ; s < n_workers ; s++)
305  {
306  unsigned int tid = omp_get_thread_num();
307  tid_x[tid] = &threadIdx.x;
308  tid_y[tid] = &threadIdx.y;
309  tid_z[tid] = &threadIdx.z;
310 
311  }
312 }
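init_wrappers() has to run once before any kernel launch on this backend: it records the number of OpenMP workers and wires each worker's threadIdx components into the tid_x/tid_y/tid_z tables used by exe_kernel(). With SE_CLASS1 enabled the launch helpers report an error if it was skipped. A minimal sketch:

int main()
{
    init_wrappers();   // once, before the first CUDA_LAUNCH*

    // ... allocate data, build ite_gpu iterators, launch kernels ...

    return 0;
}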
313 
314 
315 template<typename lambda_f>
316 struct Fun_enc
317 {
318  lambda_f Fn;
319 
320  Fun_enc(lambda_f Fn)
321  :Fn(Fn)
322  {}
323 
324  void run()
325  {
326  Fn();
327  }
328 };
329 
330 template<typename lambda_f>
331 struct Fun_enc_bt
332 {
333  lambda_f Fn;
334  dim3 & blockIdx;
335  dim3 & threadIdx;
336 
337  Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
338  :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
339  {}
340 
341  void run()
342  {
343  Fn(blockIdx,threadIdx);
344  }
345 };
346 
347 template<typename Fun_enc_type>
348 void launch_kernel(boost::context::detail::transfer_t par)
349 {
350  main_ctx = par.fctx;
351  par_glob = par.data;
352  Fun_enc_type * ptr = (Fun_enc_type *)par.data;
353 
354  ptr->run();
355 
356  boost::context::detail::jump_fcontext(par.fctx,0);
357 }
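// launch_kernel() is the fiber entry point: it stores the scheduler context
// (main_ctx) and the functor pointer (par_glob) so that __syncthreads() can
// suspend back into exe_kernel*(), runs the encoded kernel body for the
// current (blockIdx, threadIdx), and finally jumps back with data == 0 so the
// scheduler's work_to_do flag drops for this emulated thread.
//
// exe_kernel()/exe_kernel_lambda()/exe_kernel_lambda_tls() below distribute
// the blocks over the OpenMP workers. The first block handled by each worker
// runs every emulated thread on its own fiber so that __syncthreads() can be
// honoured; if no fiber ever suspends (is_sync_free stays true), the remaining
// blocks skip the fiber machinery and simply call the kernel body in a plain
// loop over threadIdx.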
358 
359 
360 template<typename lambda_f, typename ite_type>
361 static void exe_kernel(lambda_f f, ite_type & ite)
362 {
363 #ifdef SE_CLASS1
364  if (init_wrappers_call == false)
365  {
366  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
367  }
368 #endif
369 
370  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
371 
372  if (mem_stack.size() < ite.nthrs()*n_workers)
373  {
374  int old_size = mem_stack.size();
375  mem_stack.resize(ite.nthrs()*n_workers);
376 
377  for (int i = old_size ; i < mem_stack.size() ; i++)
378  {
379  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
380  }
381  }
382 
383  size_t stride = ite.nthrs();
384 
385  // Resize contexts
386  contexts.resize(mem_stack.size());
387 
388  Fun_enc<lambda_f> fe(f);
389  bool is_sync_free = true;
390 
391  bool first_block = true;
392 
393  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
394  for (int i = 0 ; i < ite.wthr.z ; i++)
395  {
396  for (int j = 0 ; j < ite.wthr.y ; j++)
397  {
398  for (int k = 0 ; k < ite.wthr.x ; k++)
399  {
400  size_t tid = omp_get_thread_num();
401 
402  if (first_block == true || is_sync_free == false)
403  {
404  blockIdx.z = i;
405  blockIdx.y = j;
406  blockIdx.x = k;
407  int nc = 0;
408  for (int it = 0 ; it < ite.thr.z ; it++)
409  {
410  for (int jt = 0 ; jt < ite.thr.y ; jt++)
411  {
412  for (int kt = 0 ; kt < ite.thr.x ; kt++)
413  {
414  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
415 
416  nc++;
417  }
418  }
419  }
420 
421  bool work_to_do = true;
422  while(work_to_do)
423  {
424  nc = 0;
425  // Work threads
426  for (int it = 0 ; it < ite.thr.z ; it++)
427  {
428  *tid_z[tid] = it;
429  for (int jt = 0 ; jt < ite.thr.y ; jt++)
430  {
431  *tid_y[tid] = jt;
432  for (int kt = 0 ; kt < ite.thr.x ; kt++)
433  {
434  *tid_x[tid] = kt;
435  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
436  contexts[nc + tid*stride] = t.fctx;
437 
438  work_to_do &= (t.data != 0);
439  is_sync_free &= !(work_to_do);
440  nc++;
441  }
442  }
443  }
444  }
445  }
446  else
447  {
448  blockIdx.z = i;
449  blockIdx.y = j;
450  blockIdx.x = k;
451  int fb = 0;
452  // Work threads
453  for (int it = 0 ; it < ite.thr.z ; it++)
454  {
455  *tid_z[tid] = it;
456  for (int jt = 0 ; jt < ite.thr.y ; jt++)
457  {
458  *tid_y[tid] = jt;
459  for (int kt = 0 ; kt < ite.thr.x ; kt++)
460  {
461  *tid_x[tid] = kt;
462  f();
463  }
464  }
465  }
466  }
467 
468  first_block = false;
469  }
470  }
471  }
472 }
473 
474 template<typename lambda_f, typename ite_type>
475 static void exe_kernel_lambda(lambda_f f, ite_type & ite)
476 {
477 #ifdef SE_CLASS1
478  if (init_wrappers_call == false)
479  {
480  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
481  }
482 #endif
483 
484  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
485 
486  if (mem_stack.size() < ite.nthrs()*n_workers)
487  {
488  int old_size = mem_stack.size();
489  mem_stack.resize(ite.nthrs()*n_workers);
490 
491  for (int i = old_size ; i < mem_stack.size() ; i++)
492  {
493  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
494  }
495  }
496 
497  size_t stride = ite.nthrs();
498 
499  // Resize contexts
500  contexts.resize(mem_stack.size());
501 
502  bool is_sync_free = true;
503 
504  bool first_block = true;
505 
506  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
507  for (int i = 0 ; i < ite.wthr.z ; i++)
508  {
509  for (int j = 0 ; j < ite.wthr.y ; j++)
510  {
511  for (int k = 0 ; k < ite.wthr.x ; k++)
512  {
513  dim3 blockIdx;
514  dim3 threadIdx;
515  Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);
516  if (first_block == true || is_sync_free == false)
517  {
518  size_t tid = omp_get_thread_num();
519 
520  blockIdx.z = i;
521  blockIdx.y = j;
522  blockIdx.x = k;
523  int nc = 0;
524  for (int it = 0 ; it < ite.thr.z ; it++)
525  {
526  for (int jt = 0 ; jt < ite.thr.y ; jt++)
527  {
528  for (int kt = 0 ; kt < ite.thr.x ; kt++)
529  {
530  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
531 
532  nc++;
533  }
534  }
535  }
536 
537  bool work_to_do = true;
538  while(work_to_do)
539  {
540  nc = 0;
541  // Work threads
542  for (int it = 0 ; it < ite.thr.z ; it++)
543  {
544  threadIdx.z = it;
545  for (int jt = 0 ; jt < ite.thr.y ; jt++)
546  {
547  threadIdx.y = jt;
548  for (int kt = 0 ; kt < ite.thr.x ; kt++)
549  {
550  threadIdx.x = kt;
551  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
552  contexts[nc + tid*stride] = t.fctx;
553 
554  work_to_do &= (t.data != 0);
555  is_sync_free &= !(work_to_do);
556  nc++;
557  }
558  }
559  }
560  }
561  }
562  else
563  {
564  blockIdx.z = i;
565  blockIdx.y = j;
566  blockIdx.x = k;
567  int fb = 0;
568  // Work threads
569  for (int it = 0 ; it < ite.thr.z ; it++)
570  {
571  threadIdx.z = it;
572  for (int jt = 0 ; jt < ite.thr.y ; jt++)
573  {
574  threadIdx.y = jt;
575  for (int kt = 0 ; kt < ite.thr.x ; kt++)
576  {
577  threadIdx.x = kt;
578  f(blockIdx,threadIdx);
579  }
580  }
581  }
582  }
583 
584  first_block = false;
585  }
586  }
587  }
588 }
589 
590 template<typename lambda_f, typename ite_type>
591 static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
592 {
593 #ifdef SE_CLASS1
594  if (init_wrappers_call == false)
595  {
596  std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;
597  }
598 #endif
599 
600  if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
601 
602  if (mem_stack.size() < ite.nthrs()*n_workers)
603  {
604  int old_size = mem_stack.size();
605  mem_stack.resize(ite.nthrs()*n_workers);
606 
607  for (int i = old_size ; i < mem_stack.size() ; i++)
608  {
609  mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
610  }
611  }
612 
613  size_t stride = ite.nthrs();
614 
615  // Resize contexts
616  contexts.resize(mem_stack.size());
617 
618  bool is_sync_free = true;
619 
620  bool first_block = true;
621 
622  #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
623  for (int i = 0 ; i < ite.wthr.z ; i++)
624  {
625  for (int j = 0 ; j < ite.wthr.y ; j++)
626  {
627  for (int k = 0 ; k < ite.wthr.x ; k++)
628  {
629  Fun_enc<lambda_f> fe(f);
630  if (first_block == true || is_sync_free == false)
631  {
632  size_t tid = omp_get_thread_num();
633 
634  blockIdx.z = i;
635  blockIdx.y = j;
636  blockIdx.x = k;
637  int nc = 0;
638  for (int it = 0 ; it < ite.thr.z ; it++)
639  {
640  for (int jt = 0 ; jt < ite.thr.y ; jt++)
641  {
642  for (int kt = 0 ; kt < ite.thr.x ; kt++)
643  {
644  contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
645 
646  nc++;
647  }
648  }
649  }
650 
651  bool work_to_do = true;
652  while(work_to_do)
653  {
654  nc = 0;
655  // Work threads
656  for (int it = 0 ; it < ite.thr.z ; it++)
657  {
658  threadIdx.z = it;
659  for (int jt = 0 ; jt < ite.thr.y ; jt++)
660  {
661  threadIdx.y = jt;
662  for (int kt = 0 ; kt < ite.thr.x ; kt++)
663  {
664  threadIdx.x = kt;
665  auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
666  contexts[nc + tid*stride] = t.fctx;
667 
668  work_to_do &= (t.data != 0);
669  is_sync_free &= !(work_to_do);
670  nc++;
671  }
672  }
673  }
674  }
675  }
676  else
677  {
678  blockIdx.z = i;
679  blockIdx.y = j;
680  blockIdx.x = k;
681  int fb = 0;
682  // Work threads
683  for (int it = 0 ; it < ite.thr.z ; it++)
684  {
685  threadIdx.z = it;
686  for (int jt = 0 ; jt < ite.thr.y ; jt++)
687  {
688  threadIdx.y = jt;
689  for (int kt = 0 ; kt < ite.thr.x ; kt++)
690  {
691  threadIdx.x = kt;
692  f();
693  }
694  }
695  }
696  }
697 
698  first_block = false;
699  }
700  }
701  }
702 }
703 
704 template<typename lambda_f, typename ite_type>
705 static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
706 {
707  #pragma omp parallel for collapse(3)
708  for (int i = 0 ; i < ite.wthr.z ; i++)
709  {
710  for (int j = 0 ; j < ite.wthr.y ; j++)
711  {
712  for (int k = 0 ; k < ite.wthr.x ; k++)
713  {
714  blockIdx.z = i;
715  blockIdx.y = j;
716  blockIdx.x = k;
717  int fb = 0;
718  // Work threads
719  for (int it = 0 ; it < ite.thr.z ; it++)
720  {
721  threadIdx.z = it;
722  for (int jt = 0 ; jt < ite.thr.y ; jt++)
723  {
724  threadIdx.y = jt;
725  for (int kt = 0 ; kt < ite.thr.x ; kt++)
726  {
727  threadIdx.x = kt;
728  f();
729  }
730  }
731  }
732  }
733  }
734  }
735 }
736 
737 #ifdef PRINT_CUDA_LAUNCHES
738 
739 #define CUDA_LAUNCH(cuda_call,ite, ...) \
740  {\
741  gridDim.x = ite.wthr.x;\
742  gridDim.y = ite.wthr.y;\
743  gridDim.z = ite.wthr.z;\
744  \
745  blockDim.x = ite.thr.x;\
746  blockDim.y = ite.thr.y;\
747  blockDim.z = ite.thr.z;\
748  \
749  CHECK_SE_CLASS1_PRE\
750  \
751  std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
752  \
753  exe_kernel([&]() -> void {\
754  \
755  \
756  cuda_call(__VA_ARGS__);\
757  \
758  },ite);\
759  \
760  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
761  }
762 
763 
764 #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
765  {\
766  dim3 wthr__(wthr_);\
767  dim3 thr__(thr_);\
768  \
769  ite_gpu<1> itg;\
770  itg.wthr = wthr_;\
771  itg.thr = thr_;\
772  \
773  gridDim.x = wthr__.x;\
774  gridDim.y = wthr__.y;\
775  gridDim.z = wthr__.z;\
776  \
777  blockDim.x = thr__.x;\
778  blockDim.y = thr__.y;\
779  blockDim.z = thr__.z;\
780  \
781  CHECK_SE_CLASS1_PRE\
782  std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
783  \
784  exe_kernel([&]() -> void {\
785  \
786  cuda_call(__VA_ARGS__);\
787  \
788  },itg);\
789  \
790  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
791  }
792 
793 
794 #define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
795  {\
796  dim3 wthr__(wthr_);\
797  dim3 thr__(thr_);\
798  \
799  ite_gpu<1> itg;\
800  itg.wthr = wthr_;\
801  itg.thr = thr_;\
802  \
803  gridDim.x = wthr__.x;\
804  gridDim.y = wthr__.y;\
805  gridDim.z = wthr__.z;\
806  \
807  blockDim.x = thr__.x;\
808  blockDim.y = thr__.y;\
809  blockDim.z = thr__.z;\
810  \
811  CHECK_SE_CLASS1_PRE\
812  std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
813  \
814  exe_kernel([&]() -> void {\
815  \
816  cuda_call(__VA_ARGS__);\
817  \
818  },itg);\
819  \
820  }
821 
822 #define CUDA_CHECK()
823 
824 #else
825 
826 #define CUDA_LAUNCH(cuda_call,ite, ...) \
827  {\
828  gridDim.x = ite.wthr.x;\
829  gridDim.y = ite.wthr.y;\
830  gridDim.z = ite.wthr.z;\
831  \
832  blockDim.x = ite.thr.x;\
833  blockDim.y = ite.thr.y;\
834  blockDim.z = ite.thr.z;\
835  \
836  CHECK_SE_CLASS1_PRE\
837  \
838  exe_kernel([&]() -> void {\
839  \
840  \
841  cuda_call(__VA_ARGS__);\
842  \
843  },ite);\
844  \
845  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
846  }
847 
848 #define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
849  {\
850  gridDim.x = ite.wthr.x;\
851  gridDim.y = ite.wthr.y;\
852  gridDim.z = ite.wthr.z;\
853  \
854  blockDim.x = ite.thr.x;\
855  blockDim.y = ite.thr.y;\
856  blockDim.z = ite.thr.z;\
857  \
858  CHECK_SE_CLASS1_PRE\
859  \
860  exe_kernel_lambda(lambda_f,ite);\
861  \
862  CHECK_SE_CLASS1_POST("lambda",0)\
863  }
864 
865 #define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
866  {\
867  gridDim.x = ite.wthr.x;\
868  gridDim.y = ite.wthr.y;\
869  gridDim.z = ite.wthr.z;\
870  \
871  blockDim.x = ite.thr.x;\
872  blockDim.y = ite.thr.y;\
873  blockDim.z = ite.thr.z;\
874  \
875  CHECK_SE_CLASS1_PRE\
876  \
877  exe_kernel_lambda_tls(lambda_f,ite);\
878  \
879  CHECK_SE_CLASS1_POST("lambda",0)\
880  }
881 
882 #define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
883  {\
884  dim3 wthr__(wthr_);\
885  dim3 thr__(thr_);\
886  \
887  ite_gpu<1> itg;\
888  itg.wthr = wthr_;\
889  itg.thr = thr_;\
890  gridDim.x = itg.wthr.x;\
891  gridDim.y = itg.wthr.y;\
892  gridDim.z = itg.wthr.z;\
893  \
894  blockDim.x = itg.thr.x;\
895  blockDim.y = itg.thr.y;\
896  blockDim.z = itg.thr.z;\
897  \
898  CHECK_SE_CLASS1_PRE\
899  \
900  exe_kernel_lambda_tls(lambda_f,itg);\
901  \
902  CHECK_SE_CLASS1_POST("lambda",0)\
903  }
904 
905 #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
906  {\
907  dim3 wthr__(wthr_);\
908  dim3 thr__(thr_);\
909  \
910  ite_gpu<1> itg;\
911  itg.wthr = wthr_;\
912  itg.thr = thr_;\
913  \
914  gridDim.x = wthr__.x;\
915  gridDim.y = wthr__.y;\
916  gridDim.z = wthr__.z;\
917  \
918  blockDim.x = thr__.x;\
919  blockDim.y = thr__.y;\
920  blockDim.z = thr__.z;\
921  \
922  CHECK_SE_CLASS1_PRE\
923  \
924  \
925  exe_kernel([&]() -> void {\
926  \
927  cuda_call(__VA_ARGS__);\
928  \
929  },itg);\
930  \
931  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
932  }
933 
934 #define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
935  {\
936  dim3 wthr__(wthr_);\
937  dim3 thr__(thr_);\
938  \
939  ite_gpu<1> itg;\
940  itg.wthr = wthr_;\
941  itg.thr = thr_;\
942  \
943  gridDim.x = wthr__.x;\
944  gridDim.y = wthr__.y;\
945  gridDim.z = wthr__.z;\
946  \
947  blockDim.x = thr__.x;\
948  blockDim.y = thr__.y;\
949  blockDim.z = thr__.z;\
950  \
951  CHECK_SE_CLASS1_PRE\
952  \
953  \
954  exe_kernel([&]() -> void {\
955  \
956  cuda_call(__VA_ARGS__);\
957  \
958  },itg);\
959  \
960  }
961 
962 #define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
963  {\
964  gridDim.x = ite.wthr.x;\
965  gridDim.y = ite.wthr.y;\
966  gridDim.z = ite.wthr.z;\
967  \
968  blockDim.x = ite.thr.x;\
969  blockDim.y = ite.thr.y;\
970  blockDim.z = ite.thr.z;\
971  \
972  CHECK_SE_CLASS1_PRE\
973  \
974  exe_kernel_no_sync([&]() -> void {\
975  \
976  \
977  cuda_call(__VA_ARGS__);\
978  \
979  },ite);\
980  \
981  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
982  }
983 
984 
985 #define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
986  {\
987  dim3 wthr__(wthr_);\
988  dim3 thr__(thr_);\
989  \
990  ite_gpu<1> itg;\
991  itg.wthr = wthr_;\
992  itg.thr = thr_;\
993  \
994  gridDim.x = wthr__.x;\
995  gridDim.y = wthr__.y;\
996  gridDim.z = wthr__.z;\
997  \
998  blockDim.x = thr__.x;\
999  blockDim.y = thr__.y;\
1000  blockDim.z = thr__.z;\
1001  \
1002  CHECK_SE_CLASS1_PRE\
1003  \
1004  exe_kernel_no_sync([&]() -> void {\
1005  \
1006  cuda_call(__VA_ARGS__);\
1007  \
1008  },itg);\
1009  \
1010  CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
1011  }
1012 
1013 #define CUDA_CHECK()
1014 
1015 #endif
1016 
1017 #endif
1018 
1019 #endif
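A sketch (not part of the header) of how a kernel is typically launched through the macros above. The kernel and buffer names are hypothetical, and ite_gpu comes from the OpenFPM iterator headers; kernels that use __syncthreads() must go through CUDA_LAUNCH/CUDA_LAUNCH_DIM3 (fiber scheduling), while the *_NOSYNC variants simply loop over the emulated threads.

// A plain function serves as the kernel body in this sketch.
void saxpy(float a, const float * x, float * y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {y[i] = a * x[i] + y[i];}
}

void run_saxpy(float a, const float * x, float * y, int n)
{
    init_wrappers();                       // once per program

    ite_gpu<1> ite;
    ite.thr.x  = 256;              ite.thr.y  = 1;  ite.thr.z  = 1;
    ite.wthr.x = (n + 255) / 256;  ite.wthr.y = 1;  ite.wthr.z = 1;

    // saxpy contains no __syncthreads(), so the NOSYNC variant would also work:
    // CUDA_LAUNCH_NOSYNC(saxpy, ite, a, x, y, n);
    CUDA_LAUNCH(saxpy, ite, a, x, y, n);
}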