#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_SEQUENTIAL

constexpr int default_kernel_wg_threads_ = 1024;

#include "util/cudify/cudify_hardware_cpu.hpp"

#ifdef HAVE_BOOST_CONTEXT

#include "util/cuda_util.hpp"
#include <boost/bind/bind.hpp>
#include <vector>

#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif

// per-fiber stacks, fiber contexts and the CUDA-style thread coordinates are shared
// globals, defined once in the backend's translation unit
extern std::vector<void *> mem_stack;

extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;

extern std::vector<boost::context::detail::fcontext_t> contexts;
extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;
// cooperative __syncthreads(): the calling fiber jumps back to the scheduler context,
// which resumes it once the other fibers of the block reached the same barrier
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
};

extern int thread_local vct_atomic_add;
extern int thread_local vct_atomic_rem;
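// How the sequential backend emulates a CUDA block (editor's note, inferred from the
// code below): every thread of a block runs as a boost::context fiber on the calling
// OS thread. A kernel that reaches __syncthreads() yields back to the scheduler
// (main_ctx); the scheduler resumes all fibers of the block round-robin, so every
// thread reaches the barrier before any of them continues past it.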
    // minimal stand-in for cub::BlockScan, for a block of exactly `dim` threads
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:

        typedef std::array<T,dim> TempStorage;

    private:

        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // thread 0 turns the gathered values into an exclusive prefix sum
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
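    // Usage sketch (editor's addition; names are placeholders): the mock follows the
    // cub::BlockScan collective for a block of exactly `dim` threads; TempStorage must
    // be a single object shared by the whole block (one slot per thread).
    //
    //   typedef BlockScan<int,128> BlockScanT;
    //   BlockScanT::TempStorage temp;             // shared by the 128 threads of the block
    //   int res;
    //   BlockScanT(temp).ExclusiveSum(val,res);   // suspends twice at __syncthreads()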
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
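// Editor's note: in this single-threaded backend atomicAdd() is a plain
// read-modify-write, but like the CUDA intrinsic it returns the value that was
// stored before the addition.
//
//   int counter = 0;
//   int slot = atomicAdd(&counter,1);   // slot == 0, counter == 1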
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };

    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };
    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
    // sequential segmented reduction: output[i] is the reduction (with op) of input
    // over the segment [segments[i], segments[i+1]), the last segment runs to count
    template<typename input_it, typename segments_it, typename output_it,
             typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                   int num_segments, output_it output, op_t op, type_t init,
                   context_t & context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // last segment runs to the end of the input
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
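    // Worked example (editor's addition; `ctx` stands for whatever context_t object the
    // caller already has, the `init` argument is not used by this sequential version):
    //
    //   int input[7]    = {1,2,3,4,5,6,7};
    //   int segments[3] = {0,2,5};                 // segments [0,2), [2,5), [5,7)
    //   int output[3];
    //   segreduce(input,7,segments,3,output,plus_t<int>(),0,ctx);
    //   // output == {3,12,13}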
    // sequential merge of two sorted key/value ranges into a third one, driven by comp
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t & context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count && b_it < b_count)
            {
                // both ranges still have elements: take the smaller head
                if (comp(b_keys[b_it],a_keys[a_it]))
                {
                    c_keys[c_it] = b_keys[b_it];
                    c_vals[c_it] = b_vals[b_it];
                    c_it++;
                    b_it++;
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else if (a_it < a_count)
            {
                // b is exhausted: copy the tail of a
                c_keys[c_it] = a_keys[a_it];
                c_vals[c_it] = a_vals[a_it];
                c_it++;
                a_it++;
            }
            else
            {
                // a is exhausted: copy the tail of b
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
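    // Usage sketch (editor's addition): merge two sorted key/value ranges with the
    // less_t comparator defined above; `ctx` is the caller's context object.
    //
    //   int   ak[3] = {1,4,9};   float av[3] = {10,40,90};
    //   int   bk[2] = {2,7};     float bv[2] = {20,70};
    //   int   ck[5];             float cv[5];
    //   merge(ak,av,3,bk,bv,2,ck,cv,less_t<int>(),ctx);
    //   // ck == {1,2,4,7,9}, cv == {10,20,40,70,90}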
static void init_wrappers()
{}

// encapsulate the kernel functor so the fiber entry point can call it back
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

// same as Fun_enc, but forwards the block/thread coordinates explicitly to the functor
template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()
    {
        Fn(blockIdx,threadIdx);
    }
};

// fiber entry point: remember where to yield to, run the encapsulated kernel,
// then jump back signalling completion with data == 0
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;

    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
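// Editor's note on the handshake: the exe_kernel* functions below pass a pointer to a
// Fun_enc wrapper as the `data` field of the first jump into each fiber; launch_kernel()
// recovers it, runs the kernel body and finally jumps back with data == 0 to signal
// "this thread finished". A non-zero data value therefore means the fiber stopped inside
// __syncthreads() and has to be resumed again on the next sweep.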
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    // grow the pool of fiber stacks up to one stack per thread of a block
    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                // create one fresh fiber per thread of the block
                int nc = 0;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                // resume all fibers round-robin until every thread of the block completed;
                // a fiber that stopped at __syncthreads() reports data != 0 and is resumed again
                bool work_to_do = true;
                while (work_to_do)
                {
                    nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;

                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
            }
        }
    }
}
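// Usage sketch (editor's addition; `my_kernel` and `ite` are placeholders): exe_kernel
// runs a no-argument functor once per emulated thread, publishing the coordinates
// through the thread_local blockIdx / threadIdx declared at the top of this file.
//
//   // `ite` is any launch descriptor exposing wthr, thr, nthrs() and nblocks()
//   exe_kernel([&]() -> void { my_kernel(arg1,arg2); },ite);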
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                if (first_block == true || is_sync_free == false)
                {
                    // run the block on fibers, so __syncthreads() can suspend/resume threads
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // the kernel never reached __syncthreads() in the first block:
                    // run the remaining blocks directly, without fibers
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f(blockIdx,threadIdx);
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
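// Editor's note: exe_kernel_lambda discovers, while running the first block on fibers,
// whether the kernel ever reaches __syncthreads(); if it does not (is_sync_free stays
// true) all remaining blocks skip the fiber machinery and call the lambda directly,
// which is considerably cheaper.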
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)    {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    contexts.resize(mem_stack.size());

    bool is_sync_free = true;
    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                // "tls" flavour: the lambda takes no arguments and reads the
                // thread_local threadIdx / blockIdx directly
                Fun_enc<lambda_f> fe(f);

                if (first_block == true || is_sync_free == false)
                {
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while (work_to_do)
                    {
                        nc = 0;
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // sync-free kernel: run the remaining blocks without fibers
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f();
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                // no __syncthreads() expected: run the threads of the block directly
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },ite);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },itg);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },ite);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
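// Example (editor's addition; `my_kernel`, `out` and `n` are placeholders): CUDA_LAUNCH
// sets gridDim/blockDim from the launch descriptor and then runs the kernel sequentially.
//
//   auto ite = out.getGPUIterator();              // any ite_gpu-like descriptor
//   CUDA_LAUNCH(my_kernel,ite,out.toKernel(),n);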
#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    exe_kernel_lambda(lambda_f,ite);\
    \
    CHECK_SE_CLASS1_POST("lambda",0)\
    }
#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    exe_kernel_lambda_tls(lambda_f,ite);\
    \
    CHECK_SE_CLASS1_POST("lambda",0)\
    }
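// Examples (editor's addition): the plain LAMBDA flavour receives the coordinates as
// arguments, while the TLS flavour takes none and reads the thread_local globals.
//
//   CUDA_LAUNCH_LAMBDA(ite,[=](dim3 bid,dim3 tid) { /* ... */ });
//   CUDA_LAUNCH_LAMBDA_TLS(ite,[=]() { /* use blockIdx / threadIdx */ });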
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },itg);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }
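// Example (editor's addition): the _DIM3 variants take the grid and block size directly
// instead of a launch descriptor; wthr_ and thr_ may be plain integers or dim3 values.
//
//   CUDA_LAUNCH_DIM3(my_kernel,16,256,out.toKernel(),n);   // 16 blocks of 256 threads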
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    gridDim.x = itg.wthr.x;\
    gridDim.y = itg.wthr.y;\
    gridDim.z = itg.wthr.z;\
    \
    blockDim.x = itg.thr.x;\
    blockDim.y = itg.thr.y;\
    blockDim.z = itg.thr.z;\
    \
    exe_kernel_lambda_tls(lambda_f,itg);\
    }
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },itg);\
    }