#ifndef CUDIFY_OPENMP_HPP_
#define CUDIFY_OPENMP_HPP_

constexpr int default_kernel_wg_threads_ = 1024;

#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP
#define GPU_HOST_DEVICE

#include "util/cudify/cudify_hardware_cpu.hpp"

#ifdef HAVE_BOOST_CONTEXT

#include <boost/bind/bind.hpp>

#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif
extern std::vector<void *> mem_stack;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;
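
// The OpenMP backend emulates a CUDA block with one boost::context fiber per
// CUDA thread. main_ctx and par_glob are thread_local bookkeeping for the
// fiber currently running on an OpenMP worker: main_ctx is the scheduler
// context to jump back to and par_glob the data forwarded on a yield, so
// __syncthreads() suspends the current fiber instead of performing a real
// hardware barrier.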
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
}
extern thread_local int vct_atomic_add;
extern thread_local int vct_atomic_rem;
template<typename T, unsigned int dim>
class BlockScan
{
public:
    typedef std::array<T,dim> TempStorage;

private:
    TempStorage & tmp;

public:
    BlockScan(TempStorage & tmp)
    :tmp(tmp)
    {}

    void ExclusiveSum(T & in, T & out)
    {
        tmp[threadIdx.x] = in;

        __syncthreads();

        // one thread performs the serial in-place exclusive prefix sum
        if (threadIdx.x == 0)
        {
            T prec = tmp[0];
            tmp[0] = 0;
            for (int i = 1 ; i < dim ; i++)
            {
                auto next = tmp[i-1] + prec;
                prec = tmp[i];
                tmp[i] = next;
            }
        }

        __syncthreads();

        out = tmp[threadIdx.x];
    }
};
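
// Illustrative use of the emulated BlockScan inside a kernel body, where
// threadIdx and __syncthreads() are meaningful ('temp_storage' must be shared
// by the whole block and 'my_value' is a hypothetical per-thread input):
//
//   BlockScan<int,128> scan(temp_storage);
//   int offset;
//   scan.ExclusiveSum(my_value, offset);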
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
}
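
// Example (illustrative): like CUDA's atomicAdd, the previous value is
// returned and the addition is performed atomically across OpenMP workers:
//
//   int counter = 0;
//   int slot = atomicAdd(&counter, 1);   // 'slot' receives the old value of 'counter'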
template<typename T, typename T2>
static T atomicAddShared(T * address, T2 val)
template<typename type_t>
struct less_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a < b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a < b;
    }
};
template<typename type_t>
struct greater_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a > b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a > b;
    }
};
template<typename type_t>
struct plus_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return a + b;
    }
};
template<typename type_t>
struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::max(a, b);
    }
};
template<typename type_t>
struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::min(a, b);
    }
};
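
// These comparison/arithmetic functors can be passed as the comparator of
// merge() below or as the reduction operator of segreduce(), e.g.
// less_t<int>() and plus_t<float>().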
template<typename input_it, typename segments_it, typename output_it,
         typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments,
               int num_segments, output_it output, op_t op, type_t init,
               context_t& context)
{
    int i = 0;
    for ( ; i < num_segments - 1; i++)
    {
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < segments[i+1] ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // last segment runs to the end of the input
    int j = segments[i];
    output[i] = input[j];
    ++j;
    for ( ; j < count ; j++)
    {
        output[i] = op(output[i],input[j]);
    }
}
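
// Example (illustrative): with input = {1,2,3,4,5}, count = 5,
// segments = {0,2}, num_segments = 2 and op = plus_t<int>(), the call
// produces output = {1+2, 3+4+5} = {3, 12}.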
template<typename a_keys_it, typename a_vals_it,
         typename b_keys_it, typename b_vals_it,
         typename c_keys_it, typename c_vals_it,
         typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
           b_keys_it b_keys, b_vals_it b_vals, int b_count,
           c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
{
    int a_it = 0;
    int b_it = 0;
    int c_it = 0;

    while (a_it < a_count || b_it < b_count)
    {
        if (a_it < a_count && b_it < b_count)
        {
            if (comp(b_keys[b_it],a_keys[a_it]))
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
            else
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++;
                a_it++;
            }
        }
        else if (a_it < a_count)
        {
            c_keys[c_it] = a_keys[a_it];
            c_vals[c_it] = a_vals[a_it];
            c_it++;
            a_it++;
        }
        else
        {
            c_keys[c_it] = b_keys[b_it];
            c_vals[c_it] = b_vals[b_it];
            c_it++;
            b_it++;
        }
    }
}
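
// Example (illustrative): merging a_keys = {1,4} with b_keys = {2,3} using
// less_t<int>() gives c_keys = {1,2,3,4}; the associated values travel with
// their keys into c_vals.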
extern size_t n_workers;

extern bool init_wrappers_call;

extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];
static void init_wrappers()
{
    init_wrappers_call = true;

    #pragma omp parallel
    {
        n_workers = omp_get_num_threads();
    }

    #pragma omp parallel for
    for (int s = 0 ; s < n_workers ; s++)
    {
        unsigned int tid = omp_get_thread_num();
        tid_x[tid] = &threadIdx.x;
        tid_y[tid] = &threadIdx.y;
        tid_z[tid] = &threadIdx.z;
    }
}
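
// init_wrappers() must be called once before the first CUDA_LAUNCH* on this
// backend (the launch helpers below print an error otherwise). Example:
//
//   init_wrappers();
//   CUDA_LAUNCH(my_kernel, ite, args...);   // 'my_kernel', 'ite' and 'args' are hypothetical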
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn) :Fn(Fn) {}

    void run() {Fn();}
};

template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run() {Fn(blockIdx,threadIdx);}
};
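
// Fun_enc wraps a kernel functor so it can be entered from a boost::context
// fiber through launch_kernel(); Fun_enc_bt additionally carries references to
// the blockIdx/threadIdx that the lambda launch variants pass as arguments.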
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    // remember the scheduler context so __syncthreads() can yield back to it
    main_ctx = par.fctx;
    par_glob = par.data;

    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
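
// Hand-shake between a kernel fiber and the scheduling loops below: a fiber is
// resumed with jump_fcontext(context, &fe); it either yields from
// __syncthreads() (transfer data != 0, more work pending) or falls off the end
// of launch_kernel() and jumps back with data == 0 (finished).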
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error: you must call init_wrappers() to use the CUDA OpenMP backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}
    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);
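
    // first_block / is_sync_free: fibers (and their stacks) are only set up
    // while the first block runs or while the kernel is known to call
    // __syncthreads(); if the first block completes without ever yielding,
    // later blocks can skip the fiber machinery entirely.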
    bool is_sync_free = true;
    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                size_t tid = omp_get_thread_num();

                if (first_block == true || is_sync_free == false)
                {
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
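
                    // Resume every fiber of the block in a sweep and repeat
                    // until all of them have finished: a fiber whose transfer
                    // data is non-zero stopped inside __syncthreads() and still
                    // has work left for the next sweep.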
                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error: you must call init_wrappers() to use the CUDA OpenMP backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}
    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)
                {
                    size_t tid = omp_get_thread_num();

                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            f(blockIdx,threadIdx);
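
// In the sync-free fall-through above the lambda is invoked directly for every
// (blockIdx, threadIdx) pair. exe_kernel_lambda_tls below follows the same
// scheme, but the lambda takes no arguments and reads the thread_local
// threadIdx/blockIdx instead.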
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error: you must call init_wrappers() to use the CUDA OpenMP backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}
    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)
                {
                    size_t tid = omp_get_thread_num();

                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    #pragma omp parallel for collapse(3)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
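
// exe_kernel_no_sync executes blocks and threads as plain nested OpenMP loops
// with no fibers, so it is only suitable for kernels that never call
// __syncthreads(); this is what CUDA_LAUNCH_NOSYNC / CUDA_LAUNCH_DIM3_NOSYNC
// use below.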
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\
#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_lambda_tls(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    exe_kernel_lambda_tls(lambda_f,itg);\
    CHECK_SE_CLASS1_POST("lambda",0)\
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
#define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
#define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
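
// Illustrative usage of the launch macros on this backend ('my_kernel', 'ite',
// 'dev_ptr' and 'n' are hypothetical):
//
//   init_wrappers();
//   CUDA_LAUNCH(my_kernel, ite, dev_ptr, n);             // grid/block taken from the iterator 'ite'
//   CUDA_LAUNCH_DIM3(my_kernel, 16, 128, dev_ptr, n);    // explicit 16 blocks of 128 threads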