#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_SEQUENTIAL

constexpr int default_kernel_wg_threads_ = 1024;

#include "util/cudify/cudify_hardware_cpu.hpp"
#ifdef HAVE_BOOST_CONTEXT

#include <boost/bind/bind.hpp>
#include <type_traits>
#include <algorithm>
#include <limits>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif
extern std::vector<void *> mem_stack;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;
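// Barrier emulation: each CUDA "thread" runs as a boost::context fiber, so __syncthreads()
// simply jumps back to the scheduling context (main_ctx); the scheduler resumes every fiber
// of the block before any of them continues past the barrier.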
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
}
extern int thread_local vct_atomic_add;
extern int thread_local vct_atomic_rem;
template<typename T, unsigned int dim>

typedef std::array<T,dim> TempStorage;
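// Exclusive prefix-sum emulation over the block-shared TempStorage: every emulated thread
// writes its input into the slot indexed by threadIdx.x, the scan is performed serially in
// place, and each thread then reads back its own slot.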
tmp[threadIdx.x] = in;

for (int i = 1 ; i < dim ; i++)
    auto next = tmp[i-1] + prec;

out = tmp[threadIdx.x];
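// atomicAdd emulation: fibers are scheduled cooperatively and never preempt one another,
// so a plain read-modify-write on *address is sufficient on this backend.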
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
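// Comparison and reduction functors mirroring the moderngpu-style operator types, so device
// code written against that interface can also run on the sequential backend.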
template<typename type_t>
struct less_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a < b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a < b;
    }
};
template<typename type_t>
struct greater_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a > b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a > b;
    }
};
template<typename type_t>
struct plus_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return a + b;
    }
    type_t reduceInitValue() const {
        return 0;
    }
};
template<typename type_t>
struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::max(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::min();
    }
};
template<typename type_t>
struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::min(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::max();
    }
};
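// Sequential segmented reduction (moderngpu-style interface): for each segment i the values
// input[segments[i] .. segments[i+1]) are folded into output[i] with op; the last segment
// extends to count.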
template<typename input_it, typename segments_it, typename output_it,
         typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments, int num_segments,
               output_it output, op_t op, type_t init, context_t& context)
    for ( ; i < num_segments - 1; i++)
        output[i] = input[j];

        for ( ; j < segments[i+1] ; j++)
            output[i] = op(output[i],input[j]);

    output[i] = input[j];

    for ( ; j < count ; j++)
        output[i] = op(output[i],input[j]);
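// Sequential merge of two sorted key/value ranges into c_keys/c_vals using comp: a classic
// two-cursor merge that drains whichever input remains once the other is exhausted.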
template<typename a_keys_it, typename a_vals_it,
         typename b_keys_it, typename b_vals_it,
         typename c_keys_it, typename c_vals_it,
         typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
           b_keys_it b_keys, b_vals_it b_vals, int b_count,
           c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    while (a_it < a_count || b_it < b_count)

        if (comp(b_keys[b_it],a_keys[a_it]))
            c_keys[c_it] = b_keys[b_it];
            c_vals[c_it] = b_vals[b_it];

        c_keys[c_it] = a_keys[a_it];
        c_vals[c_it] = a_vals[a_it];

        c_keys[c_it] = a_keys[a_it];
        c_vals[c_it] = a_vals[a_it];

        c_keys[c_it] = b_keys[b_it];
        c_vals[c_it] = b_vals[b_it];
static void init_wrappers()
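// Fun_enc / Fun_enc_bt wrap the kernel functor (Fun_enc_bt also carries references to
// blockIdx/threadIdx) so it can be handed to a fiber entry point through a single void*.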
template<typename lambda_f>
struct Fun_enc

template<typename lambda_f>
struct Fun_enc_bt

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)

        Fn(blockIdx,threadIdx);
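// Fiber entry point: par.data carries a pointer to the encoded functor, which is executed
// here; jumping back with a data value of 0 signals that this emulated thread has finished.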
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)

    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    boost::context::detail::jump_fcontext(par.fctx,0);
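// exe_kernel: one private stack and one fcontext per thread of a block; for every block the
// scheduler repeatedly jumps into each fiber until all of them report completion
// (t.data == 0), so every extra round corresponds to one __syncthreads() in the kernel.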
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, const ite_type & ite)

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)

                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);

                bool work_to_do = true;

                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                            contexts[nc] = t.fctx;
                            work_to_do &= (t.data != 0);
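// exe_kernel_lambda: same scheme, but the lambda receives blockIdx/threadIdx as arguments.
// The first block always runs on fibers; if it never yields at a barrier (is_sync_free stays
// true) the remaining blocks are executed as plain nested loops, skipping the fiber overhead.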
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);

                    bool work_to_do = true;

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                is_sync_free &= !(work_to_do);

                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            f(blockIdx,threadIdx);
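// exe_kernel_lambda_tls: identical strategy, but blockIdx/threadIdx are published through
// the thread_local globals instead of being passed to the lambda.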
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);

                    bool work_to_do = true;

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                is_sync_free &= !(work_to_do);

                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
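// exe_kernel_no_sync: runs the kernel as plain nested block/thread loops with no fibers at
// all; only valid for kernels that never call __syncthreads().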
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)

    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
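// The CUDA_LAUNCH* macros below emulate a kernel launch: they fill gridDim/blockDim from the
// launch configuration and dispatch to one of the exe_kernel* helpers above. A minimal usage
// sketch (hypothetical kernel name, following the usual OpenFPM pattern of obtaining a launch
// iterator from a container):
//
//   auto ite = v.getGPUIterator();               // grid/block configuration for container v
//   CUDA_LAUNCH(my_kernel, ite, v.toKernel());   // my_kernel runs sequentially on the CPU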
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    [&](boost::context::fiber && main) -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    [&] (boost::context::fiber && main) -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda_tls(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    exe_kernel_lambda_tls(lambda_f,itg);\

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\