#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#if CUDART_VERSION < 11000
// CUDA older than 11 does not ship a usable CUB: fall back to the legacy
// block-scan bundled with the library (include path assumed)
#include "util/cuda/cub_old/block/block_scan.cuh"
#endif

#include <limits>
#include <boost/mpl/int.hpp>
#include <boost/mpl/at.hpp>
#include <boost/mpl/vector.hpp>
#include <boost/mpl/range_c.hpp>
#include "util/cuda_util.hpp"

#if !defined(CUDA_ON_CPU)
#include "util/cudify/cuda/operators.hpp"
#endif
//! Functor returning the zero of type_t
template<typename type_t>
struct zero_t
{
	__device__ __host__ type_t operator()() const
	{return 0;}
};

//! Functor returning the maximum representable value of type_t
template<typename type_t>
struct limit_max_t
{
	__device__ __host__ type_t operator()() const
	{return std::numeric_limits<type_t>::max();}
};
//! Binary functor returning its right operand
template<typename type_t>
struct rightOperand_t
{
	__device__ __host__ type_t operator()(type_t a, type_t b) const
	{return b;}
};

//! Reduction descriptor: on property prp keep the right operand
template<unsigned int prp>
struct sRight_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = rightOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r2;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Binary functor returning its left operand
template<typename type_t>
struct leftOperand_t
{
	__device__ __host__ type_t operator()(type_t a, type_t b) const
	{return a;}
};

//! Reduction descriptor: on property prp keep the left operand
template<unsigned int prp>
struct sLeft_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = leftOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Reduction descriptor: sum on property prp
template<unsigned int prp>
struct sadd_
{
	typedef boost::mpl::int_<prp> prop;

	//! sum functor from the operators header included above (name assumed)
	template<typename red_type> using op_red = gpu::plus_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1 + r2;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Set to one the property 0 of every element of the insert buffer
template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vadd.size())	{return;}

	vadd.template get<0>(p) = 1;
}
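
/* Example (sketch, not part of the original header): a typical host-side launch
 * on an OpenFPM device vector; the vector type, getGPUIterator() and CUDA_LAUNCH
 * follow the surrounding library's conventions and are assumed here for
 * illustration.
 *
 * \code
 * openfpm::vector_gpu<aggregate<unsigned int>> vadd;
 * vadd.resize(n);
 *
 * auto ite = vadd.getGPUIterator();
 * CUDA_LAUNCH(set_one_insert_buffer, ite, vadd.toKernel());
 * \endcode
 */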
//! Element-wise sum of two block types (type_t must expose operator[])
template<typename type_t, unsigned int blockLength>
struct plus_block_t
{
	__device__ __host__ type_t operator()(type_t a, type_t b) const
	{
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = a[i] + b[i];}
		return res;
	}
};
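
/* Example (sketch, not from the original header): plus_block_t works on any
 * type with operator[]; the vec4 wrapper below is hypothetical, purely for
 * illustration.
 *
 * \code
 * struct vec4
 * {
 *     float v[4];
 *     float & operator[](int i) {return v[i];}
 *     const float & operator[](int i) const {return v[i];}
 * };
 *
 * vec4 a = {{1,2,3,4}}, b = {{10,20,30,40}};
 * vec4 c = plus_block_t<vec4,4>()(a,b);   // c = {11,22,33,44}
 * \endcode
 */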
//! Reduction descriptor: element-wise sum on the block property prp
template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = r1[i] + r2[i];}
		return res;
	}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Reduction descriptor: maximum on property prp
template<unsigned int prp>
struct smax_
{
	typedef boost::mpl::int_<prp> prop;

	//! maximum functor from the operators header included above (name assumed)
	template<typename red_type> using op_red = gpu::maximum_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return (r1 < r2)?r2:r1;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Element-wise maximum of two block types
template<typename type_t, unsigned int blockLength>
struct maximum_block_t
{
	__forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const
	{
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = max(a[i], b[i]);}
		return res;
	}
};
//! Reduction descriptor: element-wise maximum on the block property prp
template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = (r1[i] < r2[i])?r2[i]:r1[i];}
		return res;
	}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Reduction descriptor: minimum on property prp
template<unsigned int prp>
struct smin_
{
	typedef boost::mpl::int_<prp> prop;

	//! minimum functor from the operators header included above (name assumed)
	template<typename red_type> using op_red = gpu::minimum_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return (r1 < r2)?r1:r2;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Element-wise minimum of two block types
template<typename type_t, unsigned int blockLength>
struct minimum_block_t
{
	__forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const
	{
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = min(a[i], b[i]);}
		return res;
	}
};
//! Reduction descriptor: element-wise minimum on the block property prp
template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{res[i] = (r1[i] < r2[i])?r1[i]:r2[i];}
		return res;
	}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Bitwise OR of two values
template<typename type_t>
struct bitwiseOr_t
{
	__forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const
	{return a|b;}
};

//! Reduction descriptor: bitwise OR on property prp
template<unsigned int prp>
struct sBitwiseOr_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = bitwiseOr_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1|r2;}

	static bool is_special()
	{return false;}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! Special reduction: output the start offset of each segment
template<unsigned int prp>
struct sstart_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1;	/* never called: special reductions are resolved by set() */}

	static bool is_special()
	{return true;}

	//! write the segment start
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_prev;
	}
};
//! Special reduction: output the end offset of each segment
template<unsigned int prp>
struct sstop_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1;	/* never called: special reductions are resolved by set() */}

	static bool is_special()
	{return true;}

	//! write the segment end
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next;
	}
};
//! Special reduction: output the number of elements in each segment
template<unsigned int prp>
struct snum_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{return r1;	/* never called: special reductions are resolved by set() */}

	static bool is_special()
	{return true;}

	//! write the segment size
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next - seg_prev;
	}
};
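
/* Example (sketch, not from the original header): every reduction descriptor
 * above exposes the same compile-time interface (prop, op_red, red(),
 * is_special(), set()), so a parameter pack of them can drive solve_conflicts
 * below. The plain red() functions can also be exercised directly:
 *
 * \code
 * float a = 1.0f, b = 2.0f;
 * float s = sadd_<0>::red(a, b);   // 3.0f : sum on property 0
 * float m = smax_<1>::red(a, b);   // 2.0f : maximum on property 1
 * \endcode
 */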
//! Compact the per-pool insert buffers into one list of (key, source slot) pairs
template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
					vector_index_type vit_block_n,
					vector_index_type vit_block_scan,
					vector_index_type vit_list_0,
					vector_index_type vit_list_1,
					int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	// move the full blocks
	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}

	// move the remainder
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}
}
//! As above, but one thread per slot: for small pools this wastes fewer threads
template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
					vector_index_type vit_block_n,
					vector_index_type vit_block_scan,
					vector_index_type vit_list_0,
					vector_index_type vit_list_1,
					int nslot)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vit_block_data.size())	{return;}

	int pool_id = p / nslot;
	int thr_id = p % nslot;
	int start = vit_block_scan.template get<0>(pool_id);
	int n = vit_block_scan.template get<0>(pool_id+1) - start;

	// move the element only if its slot is filled
	if (thr_id < n)
	{
		vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
		vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
	}
}
//! Compact the per-pool remove buffers into one list of (key, list position) pairs
template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
					vector_index_type vit_block_n,
					vector_index_type vit_block_scan,
					vector_index_type vit_list_0,
					vector_index_type vit_list_1,
					int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	// move the full blocks
	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}

	// move the remainder
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}
}
//! Functor that merges two source elements into dst, one reduction per property
template<typename e_type, typename v_reduce>
struct data_merger
{
	e_type src1;
	e_type src2;
	e_type dst;

	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
	:src1(src1),src2(src2),dst(dst)
	{};

	//! apply the T-th reduction in v_reduce to its property
	template<typename T>
	__device__ __host__ inline void operator()(T& t) const
	{
		typedef typename boost::mpl::at<v_reduce,T>::type red_type;

		dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
	}
};
/*! \brief Merge old and inserted (key,data) pairs, reducing data on duplicated keys
 *
 * merge_index holds for every merged position the source index: values below base
 * address vct_data, values from base on address the insert buffer vct_add_data
 */
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2,
	unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
		vector_index_type merge_index, vector_data_type vct_add_data,
		vector_index_type vct_index_out, vector_data_type vct_data_out,
		vector_index_type2 vct_tot_out,
		int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// block-wide exclusive scan (CUB collective)
	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;
	if (p < vct_index.size())
	{
		// an element survives as-is if its key differs from the next key
		index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		predicate = vct_index.template get<0>(p) != id_check;
		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	size_t vct_index_out_index = blockIdx.x*block_dim + scan;

	if (predicate == 1 && p < vct_index.size() && vct_index_out_index < vct_index_out.size())
	{
		vct_index_out.template get<0>(vct_index_out_index) = vct_index.template get<0>(p);

		int index1 = merge_index.template get<0>(p);

		auto e = vct_data_out.get(vct_index_out_index);

		if (index1 < base)
		{
			// the element comes from the original vector
			e = vct_data.get(index1);
			vct_data_out.get(vct_index_out_index) = e;
		}
		else
		{
			// the element comes from the insert buffer
			e = vct_add_data.get(index1 - base);
			vct_data_out.get(vct_index_out_index) = e;
		}
	}

	if (predicate == 0 && p < vct_index.size() && vct_index_out_index < vct_index_out.size())
	{
		// conflict: the key at p is duplicated at p+1, reduce the two data elements

		// the last thread of the block only flushes the key
		if (threadIdx.x == blockDim.x-1)
		{vct_index_out.template get<0>(vct_index_out_index) = vct_index.template get<0>(p);}

		typedef boost::mpl::vector<v_reduce ...> v_reduce_;

		int index1 = merge_index.template get<0>(p);
		int index2 = merge_index.template get<0>(p+1) - base;

		data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
				vct_add_data.get(index2),
				vct_data_out.get(vct_index_out_index));

		// apply one reduction per property in the v_reduce pack
		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
	}

	// the last thread of each block reports the survivor count and its border predicate
	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
		vct_tot_out.template get<2>(blockIdx.x) = predicate;
	}
}
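
/* Example (sketch, not part of the original header): how a caller might
 * instantiate solve_conflicts. Vector types, toKernel() and CUDA_LAUNCH follow
 * the usual OpenFPM conventions and are assumed here for illustration; the pack
 * sadd_<0> reduces conflicting values on property 0 by summation.
 *
 * \code
 * constexpr unsigned int block_dim = 128;
 *
 * auto ite = vct_index.getGPUIterator();
 * CUDA_LAUNCH((solve_conflicts<decltype(vct_index.toKernel()),
 *                              decltype(vct_data.toKernel()),
 *                              decltype(vct_tot_out.toKernel()),
 *                              block_dim,
 *                              sadd_<0>>),
 *             ite,
 *             vct_index.toKernel(), vct_data.toKernel(),
 *             merge_index.toKernel(), vct_add_data.toKernel(),
 *             vct_index_out.toKernel(), vct_data_out.toKernel(),
 *             vct_tot_out.toKernel(), base);
 * \endcode
 */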
//! Variant of solve_conflicts for removals: keep only the unique keys that come
//! from the base vector (merge_index below base), dropping every removed key
template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
				vector_index_type merge_index,
				vector_index_type vct_index_out,
				vector_index_type vct_index_out_ps,
				vector_index_type2 vct_tot_out,
				int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// block-wide exclusive scan (CUB collective)
	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;
	if (p < vct_index.size())
	{
		// survive only if the key is unique and comes from the base vector
		index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
		index_type id_check = vct_index.template get<0>(p);
		predicate = id_check != id_check_p;
		predicate &= id_check != id_check_n;
		int mi = merge_index.template get<0>(p);
		predicate &= (mi < base);
		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
		vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
	}

	// the last thread of each block reports the number of survivors
	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
	}
}
//! Produce one output per segment from the segment offsets, through red_op::set
template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
				vector_type2 output,
				typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= segment_offset.size())	{return;}

	// the end of segment p is the start of segment p+1, or max_index for the last one
	typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
	if (p == segment_offset.size()-1)
	{v = max_index;}
	else
	{v = segment_offset.template get<1>(p+1);}

	red_op::set(v,segment_offset.template get<1>(p),output,p);
}
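
/* Worked example (not from the original header): with segment offsets on
 * property 1 equal to [0,3,7] and max_index = 9, red_op = snum_<1> writes the
 * segment sizes output = [3,4,2]; sstart_<1> would instead write the starts
 * [0,3,7] and sstop_<1> the ends [3,7,9].
 */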
//! Compact the block-strided output of solve_conflicts into a contiguous vector
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
			vector_index_type vct_index_out, vector_data_type vct_data_out,
			vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	// threads beyond the survivors of this block have nothing to copy
	if (threadIdx.x > tot)
	{return;}

	// the extra slot at the block border exists only if the block ended in a conflict
	if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
	{return;}

	// if the previous block ended in a conflict its merge already produced this element
	if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	if (ds + threadIdx.x >= vct_index_out.size())
	{return;}

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	auto src = vct_data.get(p);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
//! Compact the block-strided output of solve_conflicts_remove
template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
			vector_index_type vct_index_out, vct_data_type vct_data_out,
			vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	if (threadIdx.x >= tot)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	// gather the data through the original position
	int oi = vct_m_index.template get<0>(p);

	auto src = vct_data.get(oi);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
//! Reorder the data vector following the permutation in vi
template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())	{return;}

	// gather the whole element (all properties) through the permutation
	v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}
//! Apply the permutation in vi to the segment map seg_in
template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())	{return;}

	seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}
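
/* Worked example (not from the original header): with the permutation
 * vi = [2,0,1], reorder_vector_data maps v_data = [a,b,c] to
 * v_data_ord = [c,a,b], and reorder_create_index_map applies the same
 * gather to a companion segment vector.
 */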
//! Fill property prp with the element position plus an offset
template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vd.size())	{return;}

	vd.template get<prp>(p) = p + off;
}
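
/* Example (sketch, not part of the original header): set_indexes<1> on a
 * vector of size 4 with off = 10 fills property 1 with [10,11,12,13]; the
 * launch follows the same conventions as the kernels above.
 *
 * \code
 * auto ite = vd.getGPUIterator();
 * CUDA_LAUNCH(set_indexes<1>, ite, vd.toKernel(), 10);
 * \endcode
 */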
#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */