#ifndef CUDIFY_OPENMP_HPP_
#define CUDIFY_OPENMP_HPP_
constexpr int default_kernel_wg_threads_ = 1024;

#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP

#include "util/cudify/cudify_hardware_cpu.hpp"
#ifdef HAVE_BOOST_CONTEXT

#include <boost/bind/bind.hpp>
#include <type_traits>
#include <vector>
#include <array>
#include <algorithm>
#include <functional>
#include <limits>
#include <iostream>
#include <omp.h>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif
extern std::vector<void *> mem_stack;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;
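// __syncthreads() is emulated with boost::context fibers: the calling "CUDA thread" yields
// back to the scheduling context (main_ctx), which resumes the other fibers of the block up
// to the same point before jumping back, so the call behaves like a block-wide barrier.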
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
};
extern thread_local int vct_atomic_add;
extern thread_local int vct_atomic_rem;
// CPU stand-in for cub::BlockScan: a block-wide exclusive prefix sum over the threads
// (fibers) of one block, using a std::array shared by the block as temporary storage.
template<typename T, unsigned int dim>
class BlockScan
{
public:
    typedef std::array<T,dim> TempStorage;

    BlockScan(TempStorage & tmp)
    :tmp(tmp)
    {};

    void ExclusiveSum(T & in, T & out)
    {
        tmp[threadIdx.x] = in;

        __syncthreads();

        // thread 0 performs the scan sequentially in the shared storage
        if (threadIdx.x == 0)
        {
            T prec = tmp[0];
            tmp[0] = 0;
            for (int i = 1 ; i < dim ; i++)
            {
                auto next = tmp[i-1] + prec;
                prec = tmp[i];
                tmp[i] = next;
            }
        }

        __syncthreads();

        out = tmp[threadIdx.x];
    }

private:
    TempStorage & tmp;
};
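// Usage sketch (illustrative only, mirroring the cub::BlockScan API; assumes a block of 128
// threads and a `temp` storage shared by all threads of the block):
//
//   BlockScan<int,128>::TempStorage temp;
//   int thread_in = /* per-thread value */ 1;
//   int thread_out;
//   BlockScan<int,128>(temp).ExclusiveSum(thread_in, thread_out);
//   // thread_out == sum of thread_in over threads 0 .. threadIdx.x-1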
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    // relaxed atomic add across OpenMP worker threads (GCC/Clang builtin, integral types)
    return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
};
template<typename T, typename T2>
static T atomicAddShared(T * address, T2 val)
{
    // shared memory is only touched by the fibers of one block, which run one at a time on a
    // single OpenMP thread, so a plain read-modify-write is sufficient here
    T old = *address;
    *address += val;
    return old;
}
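// Example (sketch): accumulating into a counter from kernel code on this backend.
// `counter` is a hypothetical variable reachable by the kernel.
//
//   atomicAdd(&counter, 1);        // safe when several OpenMP workers update the same address
//   atomicAddShared(&counter, 1);  // only for data private to the current block (fibers of a
//                                  // block run one at a time, so no real atomic is needed)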
template<typename type_t>
struct less_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a < b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a < b;
    }
};

template<typename type_t>
struct greater_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a > b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a > b;
    }
};

template<typename type_t>
struct plus_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return a + b;
    }
    type_t reduceInitValue() const {
        return 0;
    }
};

template<typename type_t>
struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::max(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::min();
    }
};

template<typename type_t>
struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::min(a, b);
    }
    type_t reduceInitValue() const {
        return std::numeric_limits<type_t>::max();
    }
};
template<typename input_it, typename segments_it, typename output_it,
         typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments,
               int num_segments, output_it output, op_t op, type_t init,
               context_t& gpuContext)
{
    int i = 0;
    for ( ; i < num_segments - 1; i++)
    {
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < segments[i+1] ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // the last segment runs to the end of the input
    int j = segments[i];
    output[i] = input[j];
    ++j;
    for ( ; j < count ; j++)
    {
        output[i] = op(output[i],input[j]);
    }
}
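// Example (sketch): reducing the two segments [0,3) and [3,5) of a 5-element input with
// plus_t. `ctx` stands for whatever context object the caller already has; it is unused here.
//
//   int in[5]  = {1,2,3,4,5};
//   int seg[2] = {0,3};
//   int out[2];
//   segreduce(in, 5, seg, 2, out, plus_t<int>(), 0, ctx);
//   // out == {6, 9}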
template<typename a_keys_it, typename a_vals_it,
         typename b_keys_it, typename b_vals_it,
         typename c_keys_it, typename c_vals_it,
         typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
           b_keys_it b_keys, b_vals_it b_vals, int b_count,
           c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& gpuContext)
{
    int a_it = 0;
    int b_it = 0;
    int c_it = 0;

    while (a_it < a_count || b_it < b_count)
    {
        if (a_it < a_count && b_it < b_count)
        {
            if (comp(b_keys[b_it],a_keys[a_it]))
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++; b_it++;
            }
            else
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++; a_it++;
            }
        }
        else if (a_it < a_count)
        {
            c_keys[c_it] = a_keys[a_it];
            c_vals[c_it] = a_vals[a_it];
            c_it++; a_it++;
        }
        else
        {
            c_keys[c_it] = b_keys[b_it];
            c_vals[c_it] = b_vals[b_it];
            c_it++; b_it++;
        }
    }
}
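// Example (sketch): merging two sorted key/value sequences with less_t. The output arrays
// must hold a_count + b_count elements; `ctx` is the caller's context object (unused).
//
//   int ak[3] = {1,4,7}; int av[3] = {10,40,70};
//   int bk[2] = {2,5};   int bv[2] = {20,50};
//   int ck[5];           int cv[5];
//   merge(ak, av, 3, bk, bv, 2, ck, cv, less_t<int>(), ctx);
//   // ck == {1,2,4,5,7}, cv == {10,20,40,50,70}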
extern size_t n_workers;

extern bool init_wrappers_call;

extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];
static void init_wrappers()
{
    init_wrappers_call = true;

    #pragma omp parallel
    {
        n_workers = omp_get_num_threads();
    }

    #pragma omp parallel for
    for (int s = 0 ; s < n_workers ; s++)
    {
        unsigned int tid = omp_get_thread_num();
        tid_x[tid] = &threadIdx.x;
        tid_y[tid] = &threadIdx.y;
        tid_z[tid] = &threadIdx.z;
    }
}
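// Usage sketch: call init_wrappers() once, after OpenMP is available and before the first
// kernel launch through this backend:
//
//   init_wrappers();
//   // ... CUDA_LAUNCH / exe_kernel can now be used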
// Wrapper that stores the kernel functor so it can be started from inside a fiber
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

// Variant that also carries explicit blockIdx/threadIdx references and forwards them to the functor
template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()
    {
        Fn(blockIdx,threadIdx);
    }
};
// Entry point of a thread fiber: runs the encapsulated functor, then jumps back to the
// scheduling context signalling that this fiber has finished (data == 0)
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;

    Fun_enc_type * ptr = (Fun_enc_type *)par.data;
    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
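// Execution model used by the exe_kernel* functions below: every CUDA block is handled by one
// OpenMP worker; the threads of the block are boost::context fibers whose stacks live in
// mem_stack. Fibers are resumed round-robin until all of them have finished, which is what
// makes __syncthreads() act as a block-wide barrier. If the first block completes without ever
// yielding, the kernel is flagged sync-free (is_sync_free) and later blocks skip the fiber
// machinery and run the functor directly.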
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, const ite_type & ite)

    if (init_wrappers_call == false)
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);
    bool is_sync_free = true;

    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                size_t tid = omp_get_thread_num();

                if (first_block == true || is_sync_free == false)
                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);

                    bool work_to_do = true;

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                contexts[nc + tid*stride] = t.fctx;

                                work_to_do &= (t.data != 0);
                                is_sync_free &= !(work_to_do);

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)

    if (init_wrappers_call == false)
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)
                    size_t tid = omp_get_thread_num();

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);

                    bool work_to_do = true;

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                contexts[nc + tid*stride] = t.fctx;

                                work_to_do &= (t.data != 0);
                                is_sync_free &= !(work_to_do);

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                f(blockIdx,threadIdx);
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)

    if (init_wrappers_call == false)
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use cuda openmp backend" << std::endl;

    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)
                    size_t tid = omp_get_thread_num();

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);

                    bool work_to_do = true;

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                contexts[nc + tid*stride] = t.fctx;

                                work_to_do &= (t.data != 0);
                                is_sync_free &= !(work_to_do);

                    for (int it = 0 ; it < ite.thr.z ; it++)
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)

    #pragma omp parallel for collapse(3)
    for (int i = 0 ; i < ite.wthr.z ; i++)
        for (int j = 0 ; j < ite.wthr.y ; j++)
            for (int k = 0 ; k < ite.wthr.x ; k++)
                for (int it = 0 ; it < ite.thr.z ; it++)
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda_tls(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    exe_kernel_lambda_tls(lambda_f,itg);\
    CHECK_SE_CLASS1_POST("lambda",0)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\

#endif
#define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\

#define CUDA_CHECK()
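// Usage sketch: launching a kernel through the OpenMP backend. `my_kernel`, its arguments and
// the launch iterator `ite` (an object carrying the wthr/thr block and thread dimensions) are
// placeholders.
//
//   init_wrappers();                                              // once, before any launch
//   CUDA_LAUNCH(my_kernel, ite, arg1, arg2);
//   CUDA_LAUNCH_DIM3(my_kernel, dim3(16,1,1), dim3(64,1,1), arg1, arg2);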