#ifndef CUDIFY_OPENMP_HPP_
#define CUDIFY_OPENMP_HPP_

constexpr int default_kernel_wg_threads_ = 1024;

#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP

#include "cudify_hardware_common.hpp"

#include <omp.h>
#include <vector>
#include <array>
#include <algorithm>
#include <functional>
#include <iostream>

#ifdef HAVE_BOOST_CONTEXT

#include <boost/bind/bind.hpp>
#include <type_traits>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif
extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;

static void __syncthreads()
{
    // yield this emulated CUDA thread (a Boost.Context fiber) back to the block scheduler
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
}

extern thread_local int vct_atomic_add;
extern thread_local int vct_atomic_rem;
template<typename T, unsigned int dim>
class BlockScan
{
public:

    typedef std::array<T,dim> TempStorage;

private:

    TempStorage & tmp;

public:

    BlockScan(TempStorage & tmp)
    :tmp(tmp)
    {}

    void ExclusiveSum(T & in, T & out)
    {
        tmp[threadIdx.x] = in;

        __syncthreads();

        // thread 0 turns the per-thread values into an exclusive prefix sum
        if (threadIdx.x == 0)
        {
            T prec = tmp[0];
            tmp[0] = 0;
            for (int i = 1 ; i < dim ; i++)
            {
                auto next = tmp[i-1] + prec;
                prec = tmp[i];
                tmp[i] = next;
            }
        }

        __syncthreads();

        out = tmp[threadIdx.x];
    }
};
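// Example (illustrative sketch, hypothetical names): an exclusive prefix sum over one
// emulated block of 32 threads. It assumes that block-"shared" storage maps to
// per-OS-thread storage under this backend, approximated here with static thread_local.
//
//   void scan_kernel(int * data)
//   {
//       typedef BlockScan<int,32> BlockScan32;
//       static thread_local BlockScan32::TempStorage temp;      // "shared" within the block
//
//       int out;
//       BlockScan32(temp).ExclusiveSum(data[threadIdx.x],out);  // yields at __syncthreads()
//       data[threadIdx.x] = out;
//   }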
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    return __atomic_fetch_add(address,val,__ATOMIC_RELAXED);
}

template<typename T, typename T2>
static T atomicAddShared(T * address, T2 val)
{
    // shared memory is private to one block and all of its fibers run on the same
    // OS thread, so a plain read-modify-write is sufficient here
    T old = *address;
    *address += val;
    return old;
}
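// Example (illustrative sketch): atomicAdd() is safe across OpenMP worker threads
// because it maps to a relaxed __atomic_fetch_add; atomicAddShared() is only meant
// for data private to one block (one OS thread).
//
//   int total = 0;
//   #pragma omp parallel for
//   for (int i = 0 ; i < 100 ; i++)
//   {
//       atomicAdd(&total,1);
//   }
//   // total == 100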
#define MGPU_HOST_DEVICE

template<typename type_t>
struct less_t : public std::binary_function<type_t, type_t, bool> {
    bool operator()(type_t a, type_t b) const {
        return a < b;
    }
    template<typename type2_t, typename type3_t>
    bool operator()(type2_t a, type3_t b) const {
        return a < b;
    }
};

template<typename type_t>
struct greater_t : public std::binary_function<type_t, type_t, bool> {
    MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
        return a > b;
    }
    template<typename type2_t, typename type3_t>
    MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
        return a > b;
    }
};

template<typename type_t>
struct plus_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return a + b;
    }
};

template<typename type_t>
struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::max(a, b);
    }
};

template<typename type_t>
struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
    type_t operator()(type_t a, type_t b) const {
        return std::min(a, b);
    }
};
template<typename input_it, typename segments_it, typename output_it,
         typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments,
               int num_segments, output_it output, op_t op, type_t init,
               context_t & context)
{
    int i = 0;
    for ( ; i < num_segments - 1; i++)
    {
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < segments[i+1] ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // last segment: runs up to count instead of segments[i+1]
    int j = segments[i];
    output[i] = input[j];
    ++j;
    for ( ; j < count ; j++)
    {
        output[i] = op(output[i],input[j]);
    }
}
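// Example (illustrative sketch): reducing the segments [0,3) and [3,5) of a five
// element array with plus_t. The context argument is unused by this serial
// fallback, so any object can be passed.
//
//   int vals[5] = {1,2,3,4,5};
//   int segs[2] = {0,3};
//   int outs[2];
//   int ctx = 0;
//   segreduce(vals,5,segs,2,outs,plus_t<int>(),0,ctx);
//   // outs == {6,9}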
// Merge the two sorted key/value ranges (a_keys,a_vals) and (b_keys,b_vals)
// into (c_keys,c_vals)
template<typename a_keys_it, typename a_vals_it,
         typename b_keys_it, typename b_vals_it,
         typename c_keys_it, typename c_vals_it,
         typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
           b_keys_it b_keys, b_vals_it b_vals, int b_count,
           c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
{
    int a_it = 0;
    int b_it = 0;
    int c_it = 0;

    while (a_it < a_count || b_it < b_count)
    {
        if (a_it < a_count && b_it < b_count)
        {
            if (comp(b_keys[b_it],a_keys[a_it]))
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
            else
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++;
                a_it++;
            }
        }
        else if (a_it < a_count)
        {
            c_keys[c_it] = a_keys[a_it];
            c_vals[c_it] = a_vals[a_it];
            c_it++;
            a_it++;
        }
        else
        {
            c_keys[c_it] = b_keys[b_it];
            c_vals[c_it] = b_vals[b_it];
            c_it++;
            b_it++;
        }
    }
}
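// Example (illustrative sketch): merging two sorted key/value ranges with less_t.
// As above, the context argument is unused by this serial fallback.
//
//   int ak[3] = {1,4,7};  int av[3] = {10,40,70};
//   int bk[2] = {2,3};    int bv[2] = {20,30};
//   int ck[5];            int cv[5];
//   int ctx = 0;
//   merge(ak,av,3,bk,bv,2,ck,cv,less_t<int>(),ctx);
//   // ck == {1,2,3,4,7}, cv == {10,20,30,40,70}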
extern size_t n_workers;

extern bool init_wrappers_call;

extern unsigned int * tid_x[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_y[OPENMP_MAX_NUM_THREADS];
extern unsigned int * tid_z[OPENMP_MAX_NUM_THREADS];

static void init_wrappers()
{
    init_wrappers_call = true;

    #pragma omp parallel
    {
        n_workers = omp_get_num_threads();
    }

    // record, for every OpenMP worker, a pointer to its thread-local threadIdx
    #pragma omp parallel for
    for (int s = 0 ; s < n_workers ; s++)
    {
        unsigned int tid = omp_get_thread_num();
        tid_x[tid] = &threadIdx.x;
        tid_y[tid] = &threadIdx.y;
        tid_z[tid] = &threadIdx.z;
    }
}
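// Example (illustrative sketch): the backend must know the OpenMP worker count and
// the per-thread index wrappers before the first launch, so call init_wrappers()
// once at start-up.
//
//   int main(int argc, char* argv[])
//   {
//       init_wrappers();              // once, before any CUDA_LAUNCH* macro
//       // ... allocate data and launch kernels ...
//       return 0;
//   }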
// Wrap a lambda that takes no arguments (it reads the thread-local threadIdx/blockIdx)
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

// Wrap a lambda that receives blockIdx/threadIdx explicitly
template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()
    {
        Fn(blockIdx,threadIdx);
    }
};

// Entry point of every fiber: run the encapsulated kernel body, then signal
// completion (data == 0) back to the scheduler
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;
    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
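// Sketch (illustrative, hypothetical names): the minimal fiber round trip that the
// exe_kernel* functions below perform for every emulated CUDA thread.
//
//   auto body = []() { /* kernel body, may call __syncthreads() */ };
//   Fun_enc<decltype(body)> enc(body);
//   void * stack = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
//   auto ctx = boost::context::detail::make_fcontext(
//                  (char *)stack + CUDIFY_BOOST_CONTEXT_STACK_SIZE - 16,
//                  CUDIFY_BOOST_CONTEXT_STACK_SIZE,
//                  launch_kernel<Fun_enc<decltype(body)>>);
//   auto t = boost::context::detail::jump_fcontext(ctx,&enc);
//   // t.data == 0: the body ran to completion; t.data != 0: it stopped at a __syncthreads()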
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use the cuda openmp backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    // one fiber context per emulated thread per OpenMP worker
    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);
    bool is_sync_free = true;

    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                size_t tid = omp_get_thread_num();

                if (first_block == true || is_sync_free == false)
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    // create one fiber per emulated thread of this block
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    // resume every fiber in turn until all of them have completed
                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        threadIdx.z = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.y = 0;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.x = 0;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                    threadIdx.x++;
                                }
                                threadIdx.y++;
                            }
                            threadIdx.z++;
                        }
                    }
                }
                else
                {
                    // the kernel does not use __syncthreads(): run the threads sequentially
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    threadIdx.z = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.y = 0;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.x = 0;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                f();
                                threadIdx.x++;
                            }
                            threadIdx.y++;
                        }
                        threadIdx.z++;
                    }
                }

                first_block = false;
            }
        }
    }
}
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use the cuda openmp backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                // local indexes passed explicitly to the lambda
                dim3 blockIdx;
                dim3 threadIdx;
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)
                {
                    size_t tid = omp_get_thread_num();

                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        threadIdx.z = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.y = 0;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.x = 0;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                    threadIdx.x++;
                                }
                                threadIdx.y++;
                            }
                            threadIdx.z++;
                        }
                    }
                }
                else
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    threadIdx.z = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.y = 0;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.x = 0;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                f(blockIdx,threadIdx);
                                threadIdx.x++;
                            }
                            threadIdx.y++;
                        }
                        threadIdx.z++;
                    }
                }

                first_block = false;
            }
        }
    }
}
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (init_wrappers_call == false)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " error, you must call init_wrappers to use the cuda openmp backend" << std::endl;
    }

    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    if (mem_stack.size() < ite.nthrs()*n_workers)
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs()*n_workers);

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    size_t stride = ite.nthrs();

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    #pragma omp parallel for collapse(3) firstprivate(first_block) firstprivate(is_sync_free)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                // the lambda reads the thread-local blockIdx/threadIdx directly
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)
                {
                    size_t tid = omp_get_thread_num();

                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc + tid*stride] = boost::context::detail::make_fcontext((char *)mem_stack[nc + tid*stride]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        threadIdx.z = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.y = 0;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.x = 0;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc + tid*stride],&fe);
                                    contexts[nc + tid*stride] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                    threadIdx.x++;
                                }
                                threadIdx.y++;
                            }
                            threadIdx.z++;
                        }
                    }
                }
                else
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    threadIdx.z = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.y = 0;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.x = 0;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                f();
                                threadIdx.x++;
                            }
                            threadIdx.y++;
                        }
                        threadIdx.z++;
                    }
                }

                first_block = false;
            }
        }
    }
}
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    #pragma omp parallel for collapse(3)
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.z = i;
                blockIdx.y = j;
                blockIdx.x = k;

                threadIdx.z = 0;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.y = 0;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.x = 0;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            f();
                            threadIdx.x++;
                        }
                        threadIdx.y++;
                    }
                    threadIdx.z++;
                }
            }
        }
    }
}
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << " (" << ite.wthr.x << "," << ite.wthr.y << "," << ite.wthr.z << ") (" << ite.thr.x << "," << ite.thr.y << "," << ite.thr.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },ite);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },itg);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },itg);\
    }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },ite);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_lambda(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_lambda_tls(lambda_f,ite);\
    CHECK_SE_CLASS1_POST("lambda",0)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_lambda_tls(lambda_f,itg);\
    CHECK_SE_CLASS1_POST("lambda",0)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },itg);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    exe_kernel([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },itg);\
    }

#define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },ite);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3_NOSYNC(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    CHECK_SE_CLASS1_PRE\
    exe_kernel_no_sync([&]() -> void {\
    cuda_call(__VA_ARGS__);\
    },itg);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#endif

#define CUDA_CHECK()
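// Example (illustrative sketch, hypothetical kernel and data names): under this
// backend a "kernel" is a plain function reading the thread-local threadIdx and
// blockIdx, and ite is any object exposing wthr/thr (dim3) plus nthrs()/nblocks(),
// e.g. an ite_gpu<1>.
//
//   void fill_one(int * v)
//   {
//       v[blockIdx.x*blockDim.x + threadIdx.x] = 1;
//   }
//
//   // ... after init_wrappers() has been called ...
//   ite_gpu<1> ite;
//   ite.wthr = dim3(4,1,1);
//   ite.thr  = dim3(32,1,1);
//   CUDA_LAUNCH(fill_one,ite,v_ptr);
//
//   CUDA_LAUNCH_LAMBDA(ite,([&](dim3 & blockIdx, dim3 & threadIdx)
//   {
//       v_ptr[blockIdx.x*blockDim.x + threadIdx.x] = 2;
//   }));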
#endif /* HAVE_BOOST_CONTEXT */

#endif /* CUDIFY_OPENMP_HPP_ */