OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
 
cudify_sequential.hpp
#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_SEQUENTIAL
#define GPU_HOST_DEVICE

#include "config.h"

constexpr int default_kernel_wg_threads_ = 1024;

#include "util/cudify/cudify_hardware_cpu.hpp"

#ifdef HAVE_BOOST_CONTEXT

#include "util/cuda_util.hpp"
#include <boost/bind/bind.hpp>
#include <type_traits>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif
#include <vector>
#include <string.h>

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif

extern std::vector<void *> mem_stack;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

static dim3 blockDim;
static dim3 gridDim;

extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;

// Cooperative __syncthreads(): the current kernel "thread" (a fiber) yields
// control back to the scheduling context, which resumes it on the next round.
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
};
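// Illustrative sketch (assumed example, not part of the original header): under
// this backend a kernel that calls __syncthreads() simply suspends its fiber,
// so the usual barrier idiom still behaves as on a real device, e.g.
//
//   __global__ void reverse_block(float * data)      // hypothetical kernel, 64 threads per block
//   {
//       float tmp = data[threadIdx.x];
//       __syncthreads();                             // every fiber yields here, then all resume
//       data[63 - threadIdx.x] = tmp;
//   }
//
// All fibers of the block are resumed only after each of them has yielded,
// which reproduces the barrier semantics on a single host thread.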

extern int thread_local vct_atomic_add;
extern int thread_local vct_atomic_rem;

namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // Thread 0 performs the exclusive scan serially once every fiber
            // has deposited its input into the shared temporary storage.
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
            return;
        }
    };
}
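// Illustrative usage sketch (assumed example): this class mimics the subset of
// the real cub::BlockScan API used by OpenFPM, so a kernel written against CUB
// keeps compiling on the sequential backend, e.g.
//
//   template<unsigned int blockSize, typename T>
//   __global__ void scan_block(T * in, T * out)          // hypothetical kernel
//   {
//       typedef cub::BlockScan<T, blockSize> BlockScanT;
//       __shared__ typename BlockScanT::TempStorage temp; // one storage per block, as with real CUB
//
//       T v = in[threadIdx.x];
//       T res;
//       BlockScanT(temp).ExclusiveSum(v, res);            // res = in[0] + ... + in[threadIdx.x - 1]
//       out[threadIdx.x] = res;
//   }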

// Sequential backend: kernel fibers never run concurrently, so a plain
// read-modify-write is enough to emulate atomicAdd.
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
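// Illustrative usage sketch (assumed example): the emulated atomicAdd keeps the
// CUDA calling convention, returning the value stored before the update.
//
//   __global__ void count_positive(const float * v, int n, int * counter)   // hypothetical kernel
//   {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n && v[i] > 0.0f)
//       {
//           int previous = atomicAdd(counter, 1);   // previous counter value, as with the real intrinsic
//           (void)previous;
//       }
//   }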

namespace gpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*  template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*  template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

/*  template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}

namespace gpu
{
    // Sequential segmented reduction: each segment of the input is reduced
    // with the binary operator op.
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // Key-value merge.
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
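// Illustrative worked example (assumed, not part of the original header): with
// input = {1,2,3,4,5,6}, count = 6, segments = {0,2,5}, num_segments = 3 and
// op = gpu::plus_t<int>(), segreduce produces
//   output[0] = 1 + 2     = 3    (elements [0,2) )
//   output[1] = 3 + 4 + 5 = 12   (elements [2,5) )
//   output[2] = 6                (elements [5,6) )
// Note that `init` is not used by this sequential implementation, so empty
// segments are not handled here.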

static void init_wrappers()
{}

// Wrapper that stores a kernel body so it can be started from a fiber entry point.
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

// Like Fun_enc, but forwards explicit blockIdx/threadIdx references to the lambda.
template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()
    {
        Fn(blockIdx,threadIdx);
    }
};

// Fiber entry point: record the scheduler context, run the encapsulated kernel
// body, then jump back with data = 0 so the scheduler knows this fiber finished.
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;
    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}

// Execute a kernel functor over the whole grid: one fiber per thread of a
// block, blocks processed one after another on the calling host thread.
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    // Grow the pool of fiber stacks up to the number of threads per block
    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                int nc = 0;
                // Create a fresh fiber for every thread of this block
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                // Round-robin over the fibers: a fiber stopped at __syncthreads()
                // passes non-zero data back, so another round is executed; once the
                // fibers run to completion (data == 0) the block is finished.
                bool work_to_do = true;
                while(work_to_do)
                {
                    nc = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
            }
        }
    }
}
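// Illustrative sketch (assumed example): exe_kernel is normally reached through
// the CUDA_LAUNCH macros below, but for an iterator describing a 2-block by
// 4-thread 1D launch it can be pictured as
//
//   // hypothetical iterator values: ite.wthr = {2,1,1}, ite.thr = {4,1,1}
//   exe_kernel([&]() { my_kernel(args...); }, ite);   // my_kernel/args are placeholders
//
// The body runs 2*4 = 8 times, with (blockIdx.x, threadIdx.x) taking the values
// (0,0)..(0,3) and then (1,0)..(1,3), exactly as a real launch would schedule them.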

// Like exe_kernel(), but the lambda receives blockIdx/threadIdx explicitly.
// The first block always runs on fibers; if it never stops at a barrier
// (is_sync_free stays true) the remaining blocks are executed with plain
// function calls, skipping the fiber machinery entirely.
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                dim3 blockIdx;
                dim3 threadIdx;
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);
                if (first_block == true || is_sync_free == false)
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);

                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while(work_to_do)
                    {
                        nc = 0;
                        // Work threads
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int fb = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f(blockIdx,threadIdx);
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
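// Illustrative sketch (assumed example): the lambda passed to exe_kernel_lambda
// must accept the block and thread indices, e.g.
//
//   exe_kernel_lambda([&](dim3 & blockIdx, dim3 & threadIdx)
//   {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) {out[i] = in[i] * 2.0f;}            // n, in, out are placeholders
//   }, ite);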

// Variant of exe_kernel_lambda(): the kernel body takes no arguments and reads
// blockIdx/threadIdx from the thread_local globals declared at the top of this file.
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                Fun_enc<lambda_f> fe(f);
                if (first_block == true || is_sync_free == false)
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);

                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while(work_to_do)
                    {
                        nc = 0;
                        // Work threads
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int fb = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f();
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}

template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                int fb = 0;
                // Work threads
                for (int it = 0 ; it < ite.wthr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.wthr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.wthr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel(\
    [&]() -> void {\
    \
        cuda_call(__VA_ARGS__);\
    },ite);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel(\
    [&]() -> void {\
    \
        cuda_call(__VA_ARGS__);\
    \
    },itg);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel([&]() -> void {\
    \
        cuda_call(__VA_ARGS__);\
    \
    },ite);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
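// Illustrative usage sketch (assumed example): the macro is invoked exactly as
// with the real CUDA backends of OpenFPM, e.g.
//
//   auto ite = v.getGPUIterator();                  // hypothetical ite_gpu<> iterator
//   CUDA_LAUNCH(my_kernel, ite, v.toKernel(), n);   // my_kernel, v, n are placeholders
//
// which here expands to a plain loop over blocks and fiber-backed threads on
// the host, so the same source compiles for the sequential and the GPU backends.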

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel_lambda(lambda_f,ite);\
    \
    CHECK_SE_CLASS1_POST("lambda",0)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel_lambda_tls(lambda_f,ite);\
    \
    CHECK_SE_CLASS1_POST("lambda",0)\
    }
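// Illustrative usage sketch (assumed example):
//
//   CUDA_LAUNCH_LAMBDA(ite, ([&](dim3 & blockIdx, dim3 & threadIdx)
//   {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) {out[i] = in[i] + 1.0f;}           // n, in, out are placeholders
//   }));
//
// The extra parentheses keep the comma in the lambda's parameter list from
// splitting the macro arguments. CUDA_LAUNCH_LAMBDA_TLS instead takes a
// zero-argument lambda and relies on the thread_local blockIdx/threadIdx globals.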

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel([&]() -> void {\
    \
        cuda_call(__VA_ARGS__);\
    \
    },itg);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
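// Illustrative usage sketch (assumed example): launching with explicit grid and
// block sizes, as one would with a <<<wthr,thr>>> launch on a real device:
//
//   CUDA_LAUNCH_DIM3(my_kernel, 16, 64, args...);   // 16 blocks of 64 threads; my_kernel/args are placeholders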

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    \
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel_lambda_tls(lambda_f,itg);\
    \
    }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel([&]() -> void {\
    \
        cuda_call(__VA_ARGS__);\
    \
    },itg);\
    \
    }

#define CUDA_CHECK()

#endif

#endif

#else

constexpr int default_kernel_wg_threads_ = 1024;

#endif /* CUDIFY_SEQUENCIAL_HPP_ */