#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_SEQUENTIAL

constexpr int default_kernel_wg_threads_ = 1024;

#include "cudify_hardware_common.hpp"

#ifdef HAVE_BOOST_CONTEXT

#include "util/cuda_util.hpp"
#include <boost/bind/bind.hpp>
#include <type_traits>
#include <vector>
#include <array>
#include <algorithm>
#include <iostream>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif

extern std::vector<void *> mem_stack;
extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;
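// Every CUDA "thread" of a block is emulated with a boost::context fiber.
// A barrier therefore does not spin: it jumps back to the scheduling context
// (main_ctx), which resumes the other fibers of the block and re-enters this one
// once all of them have reached the barrier.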
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
};
extern thread_local int vct_atomic_add;
extern thread_local int vct_atomic_rem;
namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:
        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // thread 0 turns the gathered values into an exclusive prefix sum
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
}
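// Usage sketch (illustrative, not from the original source). The class mirrors the
// cub::BlockScan collective so kernels written against CUB keep compiling on this
// backend; temp is assumed to be storage shared by the threads of a 128-thread block:
//
//   typename cub::BlockScan<int,128>::TempStorage temp;
//   int thread_offset;
//   cub::BlockScan<int,128>(temp).ExclusiveSum(thread_count, thread_offset);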
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
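// The declarations below mirror the operator functors of moderngpu (less_t,
// greater_t, plus_t, maximum_t, minimum_t) together with sequential versions of
// its segreduce and merge primitives, so code written against the mgpu interface
// compiles unchanged on this backend; MGPU_HOST_DEVICE collapses to nothing here.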
#define MGPU_HOST_DEVICE

namespace mgpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };

    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
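    // Sequential stand-in for the segmented reduction primitive: segments[] holds
    // the start offset of every segment, output[i] becomes the reduction (op) of
    // the corresponding input range, and the last segment runs to the end of the
    // input. Illustrative example (values are not from the original source):
    //   input = {1,2,3,4,5}, segments = {0,2}, num_segments = 2, count = 5, op = plus
    //   => output = {1+2, 3+4+5} = {3, 12}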
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // the last segment runs to the end of the input
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
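    // Sequential stand-in for the key/value merge primitive: the two inputs are
    // assumed sorted with respect to comp, and a classic two-finger merge copies
    // keys and values into c until both inputs are exhausted.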
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count && b_it < b_count)
            {
                if (comp(b_keys[b_it],a_keys[a_it]))
                {
                    c_keys[c_it] = b_keys[b_it];
                    c_vals[c_it] = b_vals[b_it];
                    c_it++;
                    b_it++;
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else if (a_it < a_count)
            {
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++;
                a_it++;
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
static void init_wrappers()
{}
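// Helpers used to run a kernel body inside a fiber. Fun_enc wraps a functor that
// reads blockIdx/threadIdx from the thread_local globals, while Fun_enc_bt also
// carries references to the block/thread indices and passes them to the functor
// explicitly.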
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()  {Fn();}
};

template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()  {Fn(blockIdx,threadIdx);}
};
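// Entry point of every fiber: it stores the scheduler context and the functor
// pointer into the thread_local globals (so that __syncthreads() can yield back),
// runs the encapsulated functor, and finally jumps back to the scheduler
// signalling completion with a null data pointer.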
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;
    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
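// exe_kernel runs a kernel functor over the whole grid: for every block it creates
// one fiber per thread on the pre-allocated stacks, then keeps resuming the fibers
// round-robin until all of them have returned, so every __syncthreads() boundary
// is honoured.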
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)   {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                // create one fiber per thread of the block
                int nc = 0;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                // resume the fibers round-robin until all of them have finished
                bool work_to_do = true;
                while (work_to_do)
                {
                    nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
            }
        }
    }
}
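// Same scheme as exe_kernel, but the lambda receives blockIdx/threadIdx as
// arguments. If the first block completes without ever yielding at a barrier
// (is_sync_free stays true), the remaining blocks skip the fiber machinery and
// call the lambda directly.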
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)   {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)
                {
                    // run the block with fibers, so that __syncthreads() works
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // the first block never reached a barrier: run the threads directly
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f(blockIdx,threadIdx);
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
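// Variant of exe_kernel_lambda for lambdas that take no arguments and read
// blockIdx/threadIdx from the thread_local globals; the same sync-free fast path
// applies.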
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)   {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)
                {
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // sync-free fast path: run the threads directly
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f();
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
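// Fast path for kernels known not to call __syncthreads(): every block/thread
// combination is executed directly in a plain nested loop, without creating any
// fiber.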
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}
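// The CUDA_LAUNCH* macros reproduce the launch syntax of the other OpenFPM CUDA
// backends: they fill gridDim/blockDim from the iterator (or from explicit dim3
// values) and then run the kernel through exe_kernel / exe_kernel_lambda*.
//
// Usage sketch (illustrative only, my_kernel / v are placeholders):
//
//   auto ite = v.getGPUIterator();              // any object exposing wthr / thr
//   CUDA_LAUNCH(my_kernel, ite, v.toKernel());
//   CUDA_LAUNCH_DIM3(my_kernel, 16, 64, v.toKernel());   // 16 blocks of 64 threads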
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        exe_kernel([&]() -> void {\
            cuda_call(__VA_ARGS__);\
            },ite);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        exe_kernel([&]() -> void {\
            cuda_call(__VA_ARGS__);\
            },itg);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel([&]() -> void {\
            cuda_call(__VA_ARGS__);\
            },ite);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel_lambda(lambda_f,ite);\
        \
        CHECK_SE_CLASS1_POST("lambda",0)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel_lambda_tls(lambda_f,ite);\
        \
        CHECK_SE_CLASS1_POST("lambda",0)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel([&]() -> void {\
            cuda_call(__VA_ARGS__);\
            },itg);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        gridDim.x = itg.wthr.x;\
        gridDim.y = itg.wthr.y;\
        gridDim.z = itg.wthr.z;\
        \
        blockDim.x = itg.thr.x;\
        blockDim.y = itg.thr.y;\
        blockDim.z = itg.thr.z;\
        \
        exe_kernel_lambda_tls(lambda_f,itg);\
        }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        exe_kernel([&]() -> void {\
            cuda_call(__VA_ARGS__);\
            },itg);\
        }

#endif /* PRINT_CUDA_LAUNCHES */

#endif /* HAVE_BOOST_CONTEXT */

#endif /* CUDIFY_SEQUENCIAL_HPP_ */