OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
cudify_openmp.hpp
1#ifndef CUDIFY_OPENMP_HPP_
2#define CUDIFY_OPENMP_HPP_
3
4
5
6#include "config.h"
7
8constexpr int default_kernel_wg_threads_ = 1024;
9
10#include <omp.h>
11
12
13#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP
14#define GPU_HOST_DEVICE
15
16#include "util/cudify/cudify_hardware_cpu.hpp"
17
18#ifdef HAVE_BOOST_CONTEXT
19
20#include <boost/bind/bind.hpp>
21#include <type_traits>
22#ifdef HAVE_BOOST_CONTEXT
23#include <boost/context/continuation.hpp>
24#endif
25#include <vector>
26#include <string.h>
27
28
29#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
30#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
31#endif
32
33extern std::vector<void *>mem_stack;
34
35extern thread_local dim3 threadIdx;
36extern thread_local dim3 blockIdx;
37
38extern dim3 blockDim;
39extern dim3 gridDim;
40
41extern std::vector<void *> mem_stack;
42extern std::vector<boost::context::detail::fcontext_t> contexts;
43extern thread_local void * par_glob;
44extern thread_local boost::context::detail::fcontext_t main_ctx;
45
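// Cooperative barrier for the OpenMP backend: a "CUDA thread" is a
// Boost.Context fiber, and __syncthreads() yields the fiber back to the
// scheduler in the exe_kernel* functions below. Because the scheduler resumes
// the block's fibers round-robin, every fiber reaches the barrier before any
// of them is resumed past it, which reproduces CUDA's per-block barrier.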
46static void __syncthreads()
47{
48 boost::context::detail::jump_fcontext(main_ctx,par_glob);
49};
50
51
52extern thread_local int vct_atomic_add;
53extern thread_local int vct_atomic_rem;
54
55
56namespace cub
57{
58 template<typename T, unsigned int dim>
59 class BlockScan
60 {
61 public:
62 typedef std::array<T,dim> TempStorage;
63
64 private:
65 TempStorage & tmp;
66
67 public:
68
69
70
71 BlockScan(TempStorage & tmp)
72 :tmp(tmp)
73 {};
74
75 void ExclusiveSum(T & in, T & out)
76 {
77 tmp[threadIdx.x] = in;
78
79 __syncthreads();
80
81 if (threadIdx.x == 0)
82 {
83 T prec = tmp[0];
84 tmp[0] = 0;
85 for (int i = 1 ; i < dim ; i++)
86 {
87 auto next = tmp[i-1] + prec;
88 prec = tmp[i];
89 tmp[i] = next;
90 }
91 }
92
93 __syncthreads();
94
95 out = tmp[threadIdx.x];
96 return;
97 }
98 };
99}
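/* The class above is a minimal drop-in replacement for cub::BlockScan,
   sufficient for the exclusive prefix sums OpenFPM needs on this backend.
   Usage sketch (not part of the original header; the block size 128 and the
   wrapper function are illustrative assumptions):

   void block_exclusive_sum(int in, int & out,
                            cub::BlockScan<int,128>::TempStorage & temp)
   {
       cub::BlockScan<int,128> scan(temp);
       scan.ExclusiveSum(in, out);   // internally calls __syncthreads()
   }
*/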
100
101template<typename T, typename T2>
102static T atomicAdd(T * address, T2 val)
103{
104 return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
105};
106
107template<typename T, typename T2>
108static T atomicAddShared(T * address, T2 val)
109{
110 T old = *address;
111 *address += val;
112 return old;
113}
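/* atomicAdd maps to the GCC/Clang __atomic_fetch_add builtin with relaxed
   ordering, so it is safe when several OpenMP workers update the same
   location. atomicAddShared skips the atomic, presumably because under this
   backend all fibers of one block run on the same OpenMP worker, so
   block-shared data needs no synchronization. Usage sketch (the counter is
   illustrative):

   int next_slot = 0;
   ...
   int my_slot = atomicAdd(&next_slot, 1);   // unique slot per caller
*/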
114
115namespace gpu
116{
117 template<typename type_t>
118 struct less_t : public std::binary_function<type_t, type_t, bool> {
119 bool operator()(type_t a, type_t b) const {
120 return a < b;
121 }
122 template<typename type2_t, typename type3_t>
123 bool operator()(type2_t a, type3_t b) const {
124 return a < b;
125 }
126 };
127/* template<typename type_t>
128 struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
129 bool operator()(type_t a, type_t b) const {
130 return a <= b;
131 }
132 };*/
133 template<typename type_t>
134 struct greater_t : public std::binary_function<type_t, type_t, bool> {
135 bool operator()(type_t a, type_t b) const {
136 return a > b;
137 }
138 template<typename type2_t, typename type3_t>
139 bool operator()(type2_t a, type3_t b) const {
140 return a > b;
141 }
142 };
143/* template<typename type_t>
144 struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
145 bool operator()(type_t a, type_t b) const {
146 return a >= b;
147 }
148 };
149 template<typename type_t>
150 struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
151 bool operator()(type_t a, type_t b) const {
152 return a == b;
153 }
154 };
155 template<typename type_t>
156 struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
157 bool operator()(type_t a, type_t b) const {
158 return a != b;
159 }
160 };*/
161
163 // Device-side arithmetic operators.
164
165 template<typename type_t>
166 struct plus_t : public std::binary_function<type_t, type_t, type_t> {
167 type_t operator()(type_t a, type_t b) const {
168 return a + b;
169 }
170 };
171
172/* template<typename type_t>
173 struct minus_t : public std::binary_function<type_t, type_t, type_t> {
174 type_t operator()(type_t a, type_t b) const {
175 return a - b;
176 }
177 };
178
179 template<typename type_t>
180 struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
181 type_t operator()(type_t a, type_t b) const {
182 return a * b;
183 }
184 };*/
185
186 template<typename type_t>
187 struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
188 type_t operator()(type_t a, type_t b) const {
189 return std::max(a, b);
190 }
191 };
192
193 template<typename type_t>
194 struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
195 type_t operator()(type_t a, type_t b) const {
196 return std::min(a, b);
197 }
198 };
199}
200
201
202namespace gpu
203{
204 template<typename input_it,
205 typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
206 void segreduce(input_it input, int count, segments_it segments,
207 int num_segments, output_it output, op_t op, type_t init,
208 context_t& context)
209 {
210 int i = 0;
211 for ( ; i < num_segments - 1; i++)
212 {
213 int j = segments[i];
214 output[i] = input[j];
215 ++j;
216 for ( ; j < segments[i+1] ; j++)
217 {
218 output[i] = op(output[i],input[j]);
219 }
220 }
221
222 // Last segment
223 int j = segments[i];
224 output[i] = input[j];
225 ++j;
226 for ( ; j < count ; j++)
227 {
228 output[i] = op(output[i],input[j]);
229 }
230 }
231
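/* Sequential fallback for segmented reduction: segment i covers
   input[segments[i] .. segments[i+1]) and the last segment runs to `count`;
   `init` is accepted for interface compatibility but not used. Usage sketch
   (not part of the original header; the data and the context object `ctx`
   are illustrative, the context is only passed through):

   int vals[7]  = {1,2,3,4,5,6,7};
   int start[3] = {0,2,5};                 // segments [0,2) [2,5) [5,7)
   int out[3];
   gpu::segreduce(vals, 7, start, 3, out, gpu::plus_t<int>(), 0, ctx);
   // out == {3, 12, 13}
*/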
232 // Key-value merge.
233 template<typename a_keys_it, typename a_vals_it,
234 typename b_keys_it, typename b_vals_it,
235 typename c_keys_it, typename c_vals_it,
236 typename comp_t, typename context_t>
237 void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
238 b_keys_it b_keys, b_vals_it b_vals, int b_count,
239 c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
240 {
241 int a_it = 0;
242 int b_it = 0;
243 int c_it = 0;
244
245 while (a_it < a_count || b_it < b_count)
246 {
247 if (a_it < a_count)
248 {
249 if (b_it < b_count)
250 {
251 if (comp(b_keys[b_it],a_keys[a_it]))
252 {
253 c_keys[c_it] = b_keys[b_it];
254 c_vals[c_it] = b_vals[b_it];
255 c_it++;
256 b_it++;
257 }
258 else
259 {
260 c_keys[c_it] = a_keys[a_it];
261 c_vals[c_it] = a_vals[a_it];
262 c_it++;
263 a_it++;
264 }
265 }
266 else
267 {
268 c_keys[c_it] = a_keys[a_it];
269 c_vals[c_it] = a_vals[a_it];
270 c_it++;
271 a_it++;
272 }
273 }
274 else
275 {
276 c_keys[c_it] = b_keys[b_it];
277 c_vals[c_it] = b_vals[b_it];
278 c_it++;
279 b_it++;
280 }
281 }
282 }
283}
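/* Usage sketch for the sequential merge fallback above: it merges two
   key/value ranges that are already sorted w.r.t. `comp` into one sorted
   output range (not part of the original header; the data and the context
   object `ctx` are illustrative, the context is only passed through):

   int ak[3] = {1,4,6};  int av[3] = {10,40,60};
   int bk[2] = {2,5};    int bv[2] = {20,50};
   int ck[5];            int cv[5];
   gpu::merge(ak, av, 3, bk, bv, 2, ck, cv, gpu::less_t<int>(), ctx);
   // ck == {1,2,4,5,6}   cv == {10,20,40,50,60}
*/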
284
285extern size_t n_workers;
286
287extern bool init_wrappers_call;
288
289extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
290extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
291extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];
292
293static void init_wrappers()
294{
295 init_wrappers_call = true;
296
297 #pragma omp parallel
298 {
299 n_workers = omp_get_num_threads();
300 }
301
302 #pragma omp parallel for
303 for (int s = 0 ; s < n_workers ; s++)
304 {
305 unsigned int tid = omp_get_thread_num();
306 tid_x[tid] = &threadIdx.x;
307 tid_y[tid] = &threadIdx.y;
308 tid_z[tid] = &threadIdx.z;
309
310 }
311}
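/* init_wrappers() has to be called once, before the first kernel launch, so
   that every OpenMP worker publishes the address of its thread-local
   threadIdx components in tid_x/tid_y/tid_z (exe_kernel writes the thread
   indices through these pointers). Usage sketch (main() is shown only for
   illustration):

   int main(int argc, char * argv[])
   {
       init_wrappers();
       // ... CUDA_LAUNCH / CUDA_LAUNCH_LAMBDA calls are now valid ...
       return 0;
   }
*/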
312
313
314template<typename lambda_f>
315struct Fun_enc
316{
317 lambda_f Fn;
318
319 Fun_enc(lambda_f Fn)
320 :Fn(Fn)
321 {}
322
323 void run()
324 {
325 Fn();
326 }
327};
328
329template<typename lambda_f>
330struct Fun_enc_bt
331{
332 lambda_f Fn;
333 dim3 & blockIdx;
334 dim3 & threadIdx;
335
336 Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
337 :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
338 {}
339
340 void run()
341 {
342 Fn(blockIdx,threadIdx);
343 }
344};
345
346template<typename Fun_enc_type>
347void launch_kernel(boost::context::detail::transfer_t par)
348{
349 main_ctx = par.fctx;
350 par_glob = par.data;
351 Fun_enc_type * ptr = (Fun_enc_type *)par.data;
352
353 ptr->run();
354
355 boost::context::detail::jump_fcontext(par.fctx,0);
356}
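// launch_kernel is the fiber entry point: it records the scheduler context
// (main_ctx) and the Fun_enc* payload (par_glob), runs the kernel body once,
// and jumps back with data == 0 to signal completion. A fiber that suspends
// in __syncthreads() instead comes back with data != 0, which is what the
// schedulers below test (t.data != 0) to decide whether another round-robin
// pass over the block's fibers is needed.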
357
358
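// exe_kernel runs one grid: blocks are distributed over the OpenMP workers
// (collapse(3)), and inside a block each CUDA thread becomes a fiber with a
// CUDIFY_BOOST_CONTEXT_STACK_SIZE-byte stack taken from mem_stack. Each
// worker probes its first block with fibers; if no fiber ever suspends in
// __syncthreads() (is_sync_free stays true), the remaining blocks of that
// worker take the cheap path that simply calls f() in a nested loop.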
359template<typename lambda_f, typename ite_type>
360static void exe_kernel(lambda_f f, ite_type & ite)
361{
362#ifdef SE_CLASS1
363 if (init_wrappers_call == false)
364 {
365 std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers() before using the CUDA-on-OpenMP backend" << std::endl;
366 }
367#endif
368
369 if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
370
371 if (mem_stack.size() < ite.nthrs()*n_workers)
372 {
373 int old_size = mem_stack.size();
374 mem_stack.resize(ite.nthrs()*n_workers);
375
376 for (int i = old_size ; i < mem_stack.size() ; i++)
377 {
378 mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
379 }
380 }
381
382 size_t stride = ite.nthrs();
383
384 // Resize contexts
385 contexts.resize(mem_stack.size());
386
387 Fun_enc<lambda_f> fe(f);
388 bool is_sync_free = true;
389
390 bool first_block = true;
391
392 #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
393 for (int i = 0 ; i < ite.wthr.z ; i++)
394 {
395 for (int j = 0 ; j < ite.wthr.y ; j++)
396 {
397 for (int k = 0 ; k < ite.wthr.x ; k++)
398 {
399 size_t tid = omp_get_thread_num();
400
401 if (first_block == true || is_sync_free == false)
402 {
403 blockIdx.z = i;
404 blockIdx.y = j;
405 blockIdx.x = k;
406 int nc = 0;
407 for (int it = 0 ; it < ite.thr.z ; it++)
408 {
409 for (int jt = 0 ; jt < ite.thr.y ; jt++)
410 {
411 for (int kt = 0 ; kt < ite.thr.x ; kt++)
412 {
413 contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
414
415 nc++;
416 }
417 }
418 }
419
420 bool work_to_do = true;
421 while(work_to_do)
422 {
423 nc = 0;
424 // Work threads
425 for (int it = 0 ; it < ite.thr.z ; it++)
426 {
427 *tid_z[tid] = it;
428 for (int jt = 0 ; jt < ite.thr.y ; jt++)
429 {
430 *tid_y[tid] = jt;
431 for (int kt = 0 ; kt < ite.thr.x ; kt++)
432 {
433 *tid_x[tid] = kt;
434 auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
435 contexts[nc + tid*stride] = t.fctx;
436
437 work_to_do &= (t.data != 0);
438 is_sync_free &= !(work_to_do);
439 nc++;
440 }
441 }
442 }
443 }
444 }
445 else
446 {
447 blockIdx.z = i;
448 blockIdx.y = j;
449 blockIdx.x = k;
450 int fb = 0;
451 // Work threads
452 for (int it = 0 ; it < ite.thr.z ; it++)
453 {
454 *tid_z[tid] = it;
455 for (int jt = 0 ; jt < ite.thr.y ; jt++)
456 {
457 *tid_y[tid] = jt;
458 for (int kt = 0 ; kt < ite.thr.x ; kt++)
459 {
460 *tid_x[tid] = kt;
461 f();
462 }
463 }
464 }
465 }
466
467 first_block = false;
468 }
469 }
470 }
471}
472
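/* exe_kernel_lambda is the variant behind CUDA_LAUNCH_LAMBDA: instead of the
   thread-local globals, each block builds a private blockIdx/threadIdx pair
   and passes them to the lambda explicitly. Usage sketch (not part of the
   original header; the iterator `ite` and the output array are illustrative):

   CUDA_LAUNCH_LAMBDA(ite, [=](dim3 & blockIdx, dim3 & threadIdx) {
       out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
   });
*/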
473template<typename lambda_f, typename ite_type>
474static void exe_kernel_lambda(lambda_f f, ite_type & ite)
475{
476#ifdef SE_CLASS1
477 if (init_wrappers_call == false)
478 {
479 std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers() before using the CUDA-on-OpenMP backend" << std::endl;
480 }
481#endif
482
483 if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
484
485 if (mem_stack.size() < ite.nthrs()*n_workers)
486 {
487 int old_size = mem_stack.size();
488 mem_stack.resize(ite.nthrs()*n_workers);
489
490 for (int i = old_size ; i < mem_stack.size() ; i++)
491 {
492 mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
493 }
494 }
495
496 size_t stride = ite.nthrs();
497
498 // Resize contexts
499 contexts.resize(mem_stack.size());
500
501 bool is_sync_free = true;
502
503 bool first_block = true;
504
505 #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
506 for (int i = 0 ; i < ite.wthr.z ; i++)
507 {
508 for (int j = 0 ; j < ite.wthr.y ; j++)
509 {
510 for (int k = 0 ; k < ite.wthr.x ; k++)
511 {
512 dim3 blockIdx;
513 dim3 threadIdx;
514 Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);
515 if (first_block == true || is_sync_free == false)
516 {
517 size_t tid = omp_get_thread_num();
518
519 blockIdx.z = i;
520 blockIdx.y = j;
521 blockIdx.x = k;
522 int nc = 0;
523 for (int it = 0 ; it < ite.thr.z ; it++)
524 {
525 for (int jt = 0 ; jt < ite.thr.y ; jt++)
526 {
527 for (int kt = 0 ; kt < ite.thr.x ; kt++)
528 {
529 contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
530
531 nc++;
532 }
533 }
534 }
535
536 bool work_to_do = true;
537 while(work_to_do)
538 {
539 nc = 0;
540 // Work threads
541 for (int it = 0 ; it < ite.thr.z ; it++)
542 {
543 threadIdx.z = it;
544 for (int jt = 0 ; jt < ite.thr.y ; jt++)
545 {
546 threadIdx.y = jt;
547 for (int kt = 0 ; kt < ite.thr.x ; kt++)
548 {
549 threadIdx.x = kt;
550 auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
551 contexts[nc + tid*stride] = t.fctx;
552
553 work_to_do &= (t.data != 0);
554 is_sync_free &= !(work_to_do);
555 nc++;
556 }
557 }
558 }
559 }
560 }
561 else
562 {
563 blockIdx.z = i;
564 blockIdx.y = j;
565 blockIdx.x = k;
566 int fb = 0;
567 // Work threads
568 for (int it = 0 ; it < ite.thr.z ; it++)
569 {
570 threadIdx.z = it;
571 for (int jt = 0 ; jt < ite.thr.y ; jt++)
572 {
573 threadIdx.y = jt;
574 for (int kt = 0 ; kt < ite.thr.x ; kt++)
575 {
576 threadIdx.x = kt;
577 f(blockIdx,threadIdx);
578 }
579 }
580 }
581 }
582
583 first_block = false;
584 }
585 }
586 }
587}
588
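// exe_kernel_lambda_tls (used by CUDA_LAUNCH_LAMBDA_TLS) takes a lambda with
// no parameters: the lambda reads the thread-local blockIdx/threadIdx globals
// declared at the top of this header, just like a regular kernel function.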
589template<typename lambda_f, typename ite_type>
590static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
591{
592#ifdef SE_CLASS1
593 if (init_wrappers_call == false)
594 {
595 std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers() before using the CUDA-on-OpenMP backend" << std::endl;
596 }
597#endif
598
599 if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}
600
601 if (mem_stack.size() < ite.nthrs()*n_workers)
602 {
603 int old_size = mem_stack.size();
604 mem_stack.resize(ite.nthrs()*n_workers);
605
606 for (int i = old_size ; i < mem_stack.size() ; i++)
607 {
608 mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
609 }
610 }
611
612 size_t stride = ite.nthrs();
613
614 // Resize contexts
615 contexts.resize(mem_stack.size());
616
617 bool is_sync_free = true;
618
619 bool first_block = true;
620
621 #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
622 for (int i = 0 ; i < ite.wthr.z ; i++)
623 {
624 for (int j = 0 ; j < ite.wthr.y ; j++)
625 {
626 for (int k = 0 ; k < ite.wthr.x ; k++)
627 {
628 Fun_enc<lambda_f> fe(f);
629 if (first_block == true || is_sync_free == false)
630 {
631 size_t tid = omp_get_thread_num();
632
633 blockIdx.z = i;
634 blockIdx.y = j;
635 blockIdx.x = k;
636 int nc = 0;
637 for (int it = 0 ; it < ite.thr.z ; it++)
638 {
639 for (int jt = 0 ; jt < ite.thr.y ; jt++)
640 {
641 for (int kt = 0 ; kt < ite.thr.x ; kt++)
642 {
643 contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
644
645 nc++;
646 }
647 }
648 }
649
650 bool work_to_do = true;
651 while(work_to_do)
652 {
653 nc = 0;
654 // Work threads
655 for (int it = 0 ; it < ite.thr.z ; it++)
656 {
657 threadIdx.z = it;
658 for (int jt = 0 ; jt < ite.thr.y ; jt++)
659 {
660 threadIdx.y = jt;
661 for (int kt = 0 ; kt < ite.thr.x ; kt++)
662 {
663 threadIdx.x = kt;
664 auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
665 contexts[nc + tid*stride] = t.fctx;
666
667 work_to_do &= (t.data != 0);
668 is_sync_free &= !(work_to_do);
669 nc++;
670 }
671 }
672 }
673 }
674 }
675 else
676 {
677 blockIdx.z = i;
678 blockIdx.y = j;
679 blockIdx.x = k;
680 int fb = 0;
681 // Work threads
682 for (int it = 0 ; it < ite.thr.z ; it++)
683 {
684 threadIdx.z = it;
685 for (int jt = 0 ; jt < ite.thr.y ; jt++)
686 {
687 threadIdx.y = jt;
688 for (int kt = 0 ; kt < ite.thr.x ; kt++)
689 {
690 threadIdx.x = kt;
691 f();
692 }
693 }
694 }
695 }
696
697 first_block = false;
698 }
699 }
700 }
701}
702
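// exe_kernel_no_sync (used by the *_NOSYNC launch macros) skips the fiber
// machinery entirely and just loops over the block/thread space, so it is
// only valid for kernels that never call __syncthreads().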
703template<typename lambda_f, typename ite_type>
704static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
705{
706 #pragma omp parallel for collapse(3)
707 for (int i = 0 ; i < ite.wthr.z ; i++)
708 {
709 for (int j = 0 ; j < ite.wthr.y ; j++)
710 {
711 for (int k = 0 ; k < ite.wthr.x ; k++)
712 {
713 blockIdx.z = i;
714 blockIdx.y = j;
715 blockIdx.x = k;
716 int fb = 0;
717 // Work threads
718 for (int it = 0 ; it < ite.thr.z ; it++)
719 {
720 threadIdx.z = it;
721 for (int jt = 0 ; jt < ite.thr.y ; jt++)
722 {
723 threadIdx.y = jt;
724 for (int kt = 0 ; kt < ite.thr.x ; kt++)
725 {
726 threadIdx.x = kt;
727 f();
728 }
729 }
730 }
731 }
732 }
733 }
734}
735
736#ifdef PRINT_CUDA_LAUNCHES
737
738#define CUDA_LAUNCH(cuda_call,ite, ...) \
739 {\
740 gridDim.x = ite.wthr.x;\
741 gridDim.y = ite.wthr.y;\
742 gridDim.z = ite.wthr.z;\
743 \
744 blockDim.x = ite.thr.x;\
745 blockDim.y = ite.thr.y;\
746 blockDim.z = ite.thr.z;\
747 \
748 CHECK_SE_CLASS1_PRE\
749 \
750 std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
751 \
752 exe_kernel([&]() -> void {\
753 \
754 \
755 cuda_call(__VA_ARGS__);\
756 \
757 },ite);\
758 \
759 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
760 }
761
762
763#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
764 {\
765 dim3 wthr__(wthr_);\
766 dim3 thr__(thr_);\
767 \
768 ite_gpu<1> itg;\
769 itg.wthr = wthr_;\
770 itg.thr = thr_;\
771 \
772 gridDim.x = wthr__.x;\
773 gridDim.y = wthr__.y;\
774 gridDim.z = wthr__.z;\
775 \
776 blockDim.x = thr__.x;\
777 blockDim.y = thr__.y;\
778 blockDim.z = thr__.z;\
779 \
780 CHECK_SE_CLASS1_PRE\
781 std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
782 \
783 exe_kernel([&]() -> void {\
784 \
785 cuda_call(__VA_ARGS__);\
786 \
787 },itg);\
788 \
789 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
790 }
791
792
793#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
794 {\
795 dim3 wthr__(wthr_);\
796 dim3 thr__(thr_);\
797 \
798 ite_gpu<1> itg;\
799 itg.wthr = wthr_;\
800 itg.thr = thr_;\
801 \
802 gridDim.x = wthr__.x;\
803 gridDim.y = wthr__.y;\
804 gridDim.z = wthr__.z;\
805 \
806 blockDim.x = thr__.x;\
807 blockDim.y = thr__.y;\
808 blockDim.z = thr__.z;\
809 \
810 CHECK_SE_CLASS1_PRE\
811 std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
812 \
813 exe_kernel([&]() -> void {\
814 \
815 cuda_call(__VA_ARGS__);\
816 \
817 },itg);\
818 \
819 }
820
821#define CUDA_CHECK()
822
823#else
824
825#define CUDA_LAUNCH(cuda_call,ite, ...) \
826 {\
827 gridDim.x = ite.wthr.x;\
828 gridDim.y = ite.wthr.y;\
829 gridDim.z = ite.wthr.z;\
830 \
831 blockDim.x = ite.thr.x;\
832 blockDim.y = ite.thr.y;\
833 blockDim.z = ite.thr.z;\
834 \
835 CHECK_SE_CLASS1_PRE\
836 \
837 exe_kernel([&]() -> void {\
838 \
839 \
840 cuda_call(__VA_ARGS__);\
841 \
842 },ite);\
843 \
844 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
845 }
846
847#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
848 {\
849 gridDim.x = ite.wthr.x;\
850 gridDim.y = ite.wthr.y;\
851 gridDim.z = ite.wthr.z;\
852 \
853 blockDim.x = ite.thr.x;\
854 blockDim.y = ite.thr.y;\
855 blockDim.z = ite.thr.z;\
856 \
857 CHECK_SE_CLASS1_PRE\
858 \
859 exe_kernel_lambda(lambda_f,ite);\
860 \
861 CHECK_SE_CLASS1_POST("lambda",0)\
862 }
863
864#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
865 {\
866 gridDim.x = ite.wthr.x;\
867 gridDim.y = ite.wthr.y;\
868 gridDim.z = ite.wthr.z;\
869 \
870 blockDim.x = ite.thr.x;\
871 blockDim.y = ite.thr.y;\
872 blockDim.z = ite.thr.z;\
873 \
874 CHECK_SE_CLASS1_PRE\
875 \
876 exe_kernel_lambda_tls(lambda_f,ite);\
877 \
878 CHECK_SE_CLASS1_POST("lambda",0)\
879 }
880
881#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
882 {\
883 dim3 wthr__(wthr_);\
884 dim3 thr__(thr_);\
885 \
886 ite_gpu<1> itg;\
887 itg.wthr = wthr_;\
888 itg.thr = thr_;\
889 gridDim.x = itg.wthr.x;\
890 gridDim.y = itg.wthr.y;\
891 gridDim.z = itg.wthr.z;\
892 \
893 blockDim.x = itg.thr.x;\
894 blockDim.y = itg.thr.y;\
895 blockDim.z = itg.thr.z;\
896 \
897 CHECK_SE_CLASS1_PRE\
898 \
899 exe_kernel_lambda_tls(lambda_f,itg);\
900 \
901 CHECK_SE_CLASS1_POST("lambda",0)\
902 }
903
904#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
905 {\
906 dim3 wthr__(wthr_);\
907 dim3 thr__(thr_);\
908 \
909 ite_gpu<1> itg;\
910 itg.wthr = wthr_;\
911 itg.thr = thr_;\
912 \
913 gridDim.x = wthr__.x;\
914 gridDim.y = wthr__.y;\
915 gridDim.z = wthr__.z;\
916 \
917 blockDim.x = thr__.x;\
918 blockDim.y = thr__.y;\
919 blockDim.z = thr__.z;\
920 \
921 CHECK_SE_CLASS1_PRE\
922 \
923 \
924 exe_kernel([&]() -> void {\
925 \
926 cuda_call(__VA_ARGS__);\
927 \
928 },itg);\
929 \
930 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
931 }
932
933#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
934 {\
935 dim3 wthr__(wthr_);\
936 dim3 thr__(thr_);\
937 \
938 ite_gpu<1> itg;\
939 itg.wthr = wthr_;\
940 itg.thr = thr_;\
941 \
942 gridDim.x = wthr__.x;\
943 gridDim.y = wthr__.y;\
944 gridDim.z = wthr__.z;\
945 \
946 blockDim.x = thr__.x;\
947 blockDim.y = thr__.y;\
948 blockDim.z = thr__.z;\
949 \
950 CHECK_SE_CLASS1_PRE\
951 \
952 \
953 exe_kernel([&]() -> void {\
954 \
955 cuda_call(__VA_ARGS__);\
956 \
957 },itg);\
958 \
959 }
960
961#define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
962 {\
963 gridDim.x = ite.wthr.x;\
964 gridDim.y = ite.wthr.y;\
965 gridDim.z = ite.wthr.z;\
966 \
967 blockDim.x = ite.thr.x;\
968 blockDim.y = ite.thr.y;\
969 blockDim.z = ite.thr.z;\
970 \
971 CHECK_SE_CLASS1_PRE\
972 \
973 exe_kernel_no_sync([&]() -> void {\
974 \
975 \
976 cuda_call(__VA_ARGS__);\
977 \
978 },ite);\
979 \
980 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
981 }
982
983
984#define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
985 {\
986 dim3 wthr__(wthr_);\
987 dim3 thr__(thr_);\
988 \
989 ite_gpu<1> itg;\
990 itg.wthr = wthr_;\
991 itg.thr = thr_;\
992 \
993 gridDim.x = wthr__.x;\
994 gridDim.y = wthr__.y;\
995 gridDim.z = wthr__.z;\
996 \
997 blockDim.x = thr__.x;\
998 blockDim.y = thr__.y;\
999 blockDim.z = thr__.z;\
1000 \
1001 CHECK_SE_CLASS1_PRE\
1002 \
1003 exe_kernel_no_sync([&]() -> void {\
1004 \
1005 cuda_call(__VA_ARGS__);\
1006 \
1007 },itg);\
1008 \
1009 CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
1010 }
1011
1012#define CUDA_CHECK()
1013
1014#endif
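/* Usage sketch (not part of the original header): under this backend a
   "kernel" is an ordinary function, and CUDA_LAUNCH expands to a direct call
   wrapped in the OpenMP/fiber scheduler above. The kernel, the data and the
   launch iterator are illustrative:

   void scale_kernel(float * v, int n, float s)
   {
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i < n) {v[i] *= s;}
   }

   ...
   init_wrappers();
   auto ite = ...;                    // an ite_gpu<> with wthr/thr filled in
   CUDA_LAUNCH(scale_kernel, ite, v_ptr, n, 2.0f);
*/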
1015
1016#endif
1017
1018#endif