#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#include <limits>      // std::numeric_limits
#include <functional>  // std::binary_function

#if CUDART_VERSION < 11000
#include "util/cuda_launch.hpp"
#endif

#if !defined(CUDA_ON_CPU)
#include "util/cudify/cuda/operators.hpp"
#endif
//! return 0, used as the initial value of a reduction
template<typename type_t>
struct zero_t {
	__device__ __host__ type_t operator()() const {
		return 0;
	}
};

//! return the maximum representable value, used as the initial value of a min-reduction
template<typename type_t>
struct limit_max_t {
	__device__ __host__ type_t operator()() const {
		return std::numeric_limits<type_t>::max();
	}
};
//! binary functor that always selects the right operand
template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return b;
	}
};
//! reduction policy: keep the value coming from the right operand
template<unsigned int prp>
struct sRight_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = rightOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
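/* Sketch of the reduction-policy contract the structs in this file follow
 * (the policy name below is illustrative, not part of the header): expose the
 * target property as `prop`, a pairwise `red(r1,r2)`, an `is_special()` flag,
 * and a `set(seg_next,seg_prev,output,i)` used only by special, segment-based
 * reductions. A hypothetical custom policy would look like:
 *
 *   template<unsigned int prp>
 *   struct smul_
 *   {
 *       typedef boost::mpl::int_<prp> prop;
 *       template<typename r> __device__ __host__ static r red(r & a, r & b)
 *       { return a * b; }
 *       static bool is_special() { return false; }
 *       template<typename s, typename o>
 *       __device__ __host__ static void set(s, s, o &, int) {}
 *   };
 */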
//! binary functor that always selects the left operand
template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return a;
	}
};
//! reduction policy: keep the value coming from the left operand
template<unsigned int prp>
struct sLeft_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = leftOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! reduction policy: sum the two colliding values
template<unsigned int prp>
struct sadd_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return r1 + r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
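/* Usage sketch (hedged: the consuming API lives outside this header): these
 * policies are passed as template arguments to the sparse-vector flush/merge
 * machinery, e.g. a flush parametrized on sadd_<0> sums colliding values of
 * property 0, while smax_<1> would keep the maximum of property 1 whenever
 * two inserts with the same key are merged. */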
//! set property 0 of every element of the insert buffer to 1
template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vadd.size())
	{return;}

	vadd.template get<0>(p) = 1;
}
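/* Launch sketch (illustrative, one thread per element; in OpenFPM code this
 * would typically go through the CUDA_LAUNCH machinery from cuda_launch.hpp
 * rather than a raw triple-chevron launch):
 *
 *   auto ite = vadd.getGPUIterator();
 *   CUDA_LAUNCH(set_one_insert_buffer, ite, vadd.toKernel());
 */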
//! element-wise sum of two fixed-length blocks (array-valued properties)
template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = a[i] + b[i];
		}
		return res;
	}
};
//! reduction policy: element-wise sum over block (array) properties
template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	using op_red = plus_block_t<red_type, blockLength>;

	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = r1[i] + r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
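/* Worked example: for an array-valued property such as float[4],
 * sadd_block_<prp,4> reduces two colliding blocks component by component,
 * {1,2,3,4} + {10,20,30,40} -> {11,22,33,44}; the scalar sadd_ policy above
 * would not compile for such a type. */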
//! reduction policy: keep the maximum of the two colliding values
template<unsigned int prp>
struct smax_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	__device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r2:r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! element-wise maximum of two fixed-length blocks
template<typename type_t, unsigned int blockLength>
struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
	GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = max(a[i], b[i]);
		}
		return res;
	}
};
//! reduction policy: element-wise maximum over block (array) properties
template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	using op_red = maximum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! reduction policy: keep the minimum of the two colliding values
template<unsigned int prp>
struct smin_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r1:r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! element-wise minimum of two fixed-length blocks
template<typename type_t, unsigned int blockLength>
struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
	GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = min(a[i], b[i]);
		}
		return res;
	}
};
//! reduction policy: element-wise minimum over block (array) properties
template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	using op_red = minimum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! bitwise OR of the two operands
template<typename type_t>
struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
	GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		return a|b;
	}
};
//! reduction policy: bitwise OR of the two colliding values (e.g. flag masks)
template<unsigned int prp>
struct sBitwiseOr_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type>
	using op_red = bitwiseOr_t<red_type>;

	template<typename red_type>
	__device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return r1 | r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction, so it does not need the segment bounds
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! special reduction: the output is the start offset of each segment
template<unsigned int prp>
struct sstart_
{
	typedef boost::mpl::int_<prp> prop;

	//! the pairwise reduction is unused for special policies
	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	//! write the segment start offset
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_prev;
	}
};
//! special reduction: the output is the stop offset of each segment
template<unsigned int prp>
struct sstop_
{
	typedef boost::mpl::int_<prp> prop;

	//! the pairwise reduction is unused for special policies
	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	//! write the segment stop offset
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next;
	}
};
//! special reduction: the output is the number of elements in each segment
template<unsigned int prp>
struct snum_
{
	typedef boost::mpl::int_<prp> prop;

	//! the pairwise reduction is unused for special policies
	template<typename red_type> __device__ __host__
	static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	//! write the segment size
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next - seg_prev;
	}
};
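/* Worked example of the special policies: for a segment with
 * seg_prev = 3 and seg_next = 7, sstart_ writes 3, sstop_ writes 7 and
 * snum_ writes the segment size 4. They never look at the data values,
 * which is why is_special() returns true and red() is a placeholder. */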
//! build the insert list (key + source slot) from the per-pool insert buffers
template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
					 vector_index_type vit_block_n,
					 vector_index_type vit_block_scan,
					 vector_index_type vit_list_0,
					 vector_index_type vit_list_1,
					 int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}

	// move the remaining tail (less than one full block of threads)
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}
}
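/* Worked example: each pool (one CUDA block) owns nslot contiguous slots of
 * vit_block_data and has filled the first vit_block_n[b] of them;
 * vit_block_scan is the exclusive scan of vit_block_n. With nslot = 4 and
 * per-pool counts {2,1}, the keys in source slots {0,1,4} are compacted into
 * vit_list_0 positions {0,1,2}, while vit_list_1 records the source slots so
 * the corresponding data can be gathered later. */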
//! variant for small pools: one thread per slot instead of one block per pool
template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
					 vector_index_type vit_block_n,
					 vector_index_type vit_block_scan,
					 vector_index_type vit_list_0,
					 vector_index_type vit_list_1,
					 int nslot)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vit_block_data.size())
	{return;}

	int pool_id = p / nslot;
	int thr_id = p % nslot;
	int start = vit_block_scan.template get<0>(pool_id);
	int n = vit_block_scan.template get<0>(pool_id+1) - start;

	// copy only the filled slots of this pool
	if (thr_id < n)
	{
		vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
		vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
	}
}
//! build the remove list (key + destination position) from the per-pool remove buffers
template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
					 vector_index_type vit_block_n,
					 vector_index_type vit_block_scan,
					 vector_index_type vit_list_0,
					 vector_index_type vit_list_1,
					 int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}

	// move the remaining tail (less than one full block of threads)
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}
}
//! apply, for each property listed in v_reduce, the corresponding reduction
//! to a pair of colliding elements, writing the result into dst
template<typename e_type, typename v_reduce>
struct data_merger
{
	e_type src1;
	e_type src2;
	e_type dst;

	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
	:src1(src1),src2(src2),dst(dst)
	{
	}

	//! apply the reduction associated to property T
	template<typename T>
	__device__ __host__ inline void operator()(T& t) const
	{
		typedef typename boost::mpl::at<v_reduce,T>::type red_type;

		dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
	}
};
//! merge the sorted index/data arrays, collapsing equal adjacent indexes with
//! the reductions in the v_reduce pack
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2,
		 unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
			  vector_index_type merge_index, vector_data_type vct_add_data,
			  vector_index_type vct_index_out, vector_data_type vct_data_out,
			  vector_index_type2 vct_tot_out,
			  int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	// an element survives as-is if its index differs from the next one
	if (p < vct_index.size())
	{
		index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		predicate = vct_index.template get<0>(p) != id_check;
		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);

		int index1 = merge_index.template get<0>(p);

		auto e = vct_data_out.get(blockIdx.x*block_dim + scan);

		// merge indexes below base point into the old data, the rest into the add buffer
		if (index1 < base)
		{
			e = vct_data.get(index1);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
		else
		{
			e = vct_add_data.get(index1 - base);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
	}

	if (predicate == 0 && p < vct_index.size())
	{
		// the last thread of the block cannot complete the merge with the
		// element owned by the next block; it only writes the index and
		// signals the pending merge through vct_tot_out (property 2)
		if (threadIdx.x == blockDim.x-1)
		{vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);}
		else
		{
			typedef boost::mpl::vector<v_reduce ...> v_reduce_;

			int index1 = merge_index.template get<0>(p);
			int index2 = merge_index.template get<0>(p+1) - base;

			data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
									vct_add_data.get(index2),
									vct_data_out.get(blockIdx.x*block_dim + scan));

			// process each property with its reduction
			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
		}
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
		vct_tot_out.template get<2>(blockIdx.x) = predicate;
	}
}
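/* Launch sketch (illustrative): block_dim must match the launch block size,
 * and one v_reduce policy is supplied per property to merge, e.g.
 *
 *   solve_conflicts<decltype(idx),decltype(data),decltype(tot),128,
 *                   sadd_<0>,smax_<1>>
 *       <<<nblocks,128>>>(idx,data,midx,add_data,idx_out,data_out,tot,base);
 *
 * where entries of merge_index below `base` refer to the old data array and
 * the rest (minus base) to the add buffer. The names idx, data, midx, ... are
 * placeholders for the caller's containers. */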
//! drop the indexes that appear in the remove list, keeping only unique
//! entries that come from the original vector
template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
			 vector_index_type merge_index,
			 vector_index_type vct_index_out,
			 vector_index_type vct_index_out_ps,
			 vector_index_type2 vct_tot_out,
			 int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	if (p < vct_index.size())
	{
		index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
		index_type id_check = vct_index.template get<0>(p);
		// survive only if unique (no matching remove key on either side) ...
		predicate = id_check != id_check_p;
		predicate &= id_check != id_check_n;
		int mi = merge_index.template get<0>(p);
		// ... and coming from the original vector (merge index below base)
		predicate &= (mi < base);
		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
		vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
	}
}
//! apply a special reduction (sstart_/sstop_/snum_) from a segment-offset list
template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
		     vector_type2 output,
		     typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= segment_offset.size())
	{return;}

	// the next segment start; the last segment is closed by max_index
	typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
	if (p == segment_offset.size()-1)
	{v = max_index;}
	else
	{v = segment_offset.template get<1>(p+1);}

	red_op::set(v,segment_offset.template get<1>(p),output,p);
}
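/* Usage sketch: pairing this kernel with snum_<0> turns a vector of segment
 * start offsets (stored in property 1) into per-segment element counts,
 * with max_index closing the last segment; sstart_/sstop_ extract the
 * boundaries instead. */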
//! compact the per-block output of solve_conflicts into a contiguous vector
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
		  vector_index_type vct_index_out, vector_data_type vct_data_out,
		  vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())
	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	// threads beyond the valid elements of this block do nothing
	if (threadIdx.x > tot)
	{return;}

	// the element at position tot is skipped when the block's last merge
	// was left pending (carry flag, property 2, set by solve_conflicts)
	if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
	{return;}

	// thread 0 skips its element when the previous block already resolved it
	if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	if (ds + threadIdx.x >= vct_index_out.size())
	{return;}

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	auto src = vct_data.get(p);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
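/* Sketch of the compaction: vct_tot_out_scan carries, per block, the number
 * of surviving elements (property 0), their scanned output base (property 1)
 * and the carry flag (property 2) written by solve_conflicts; once the
 * boundary cases are excluded, each thread copies its element to
 * base + threadIdx.x. */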
//! compact the per-block output of solve_conflicts_remove into a contiguous vector
template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
			 vector_index_type vct_index_out, vct_data_type vct_data_out,
			 vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())
	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	if (threadIdx.x >= tot)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	int oi = vct_m_index.template get<0>(p);

	auto src = vct_data.get(oi);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
//! gather the data into sorted order: v_data_ord[p] = v_data[vi[p]]
template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())
	{return;}

	// reorder based on the permutation in vi
	v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}
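/* Worked example: vi = {2,0,1} and v_data = {a,b,c} produce
 * v_data_ord = {c,a,b}; get_o() copies the whole element (every property)
 * in one assignment. */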
//! apply the same permutation to a segment-index map: seg_out[p] = seg_in[vi[p]]
template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())
	{return;}

	seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}
//! fill property prp with the element's global index plus an offset
template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vd.size())
	{return;}

	vd.template get<prp>(p) = p + off;
}
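/* Worked example: set_indexes<1> on a 4-element vector with off = 10 writes
 * {10,11,12,13} into property 1, i.e. an iota shifted by off. */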
#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */