#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#if CUDART_VERSION < 11000
#include "util/cuda/moderngpu/operators.hxx"
#include "util/cuda_launch.hpp"
#else
#if !defined(CUDA_ON_CPU)
#include "util/cuda/moderngpu/operators.hxx"
#endif
#endif

//! binary operator that always returns its right operand
template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return b;
	}
};
template<unsigned int prp>
struct sRight_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = rightOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! binary operator that always returns its left operand
template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return a;
	}
};
template<unsigned int prp>
struct sLeft_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = leftOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! sum reduction
template<unsigned int prp>
struct sadd_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::plus_t<red_type>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1 + r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vadd.size())
	{return;}

	vadd.template get<0>(p) = 1;
}
//! element-wise sum over fixed-length blocks
template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = a[i] + b[i];
		}
		return res;
	}
};
//! block-wise sum reduction
template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = r1[i] + r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! maximum reduction
template<unsigned int prp>
struct smax_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::maximum_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r2:r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! element-wise maximum over fixed-length blocks
template<typename type_t, unsigned int blockLength>
struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = max(a[i], b[i]);
		}
		return res;
	}
};
//! block-wise maximum reduction
template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! minimum reduction
template<unsigned int prp>
struct smin_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r1:r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! element-wise minimum over fixed-length blocks
template<typename type_t, unsigned int blockLength>
struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = min(a[i], b[i]);
		}
		return res;
	}
};
//! block-wise minimum reduction
template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i = 0 ; i < blockLength ; ++i)
		{
			res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! bitwise-or operator
template<typename type_t>
struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		return a | b;
	}
};
//! bitwise-or reduction
template<unsigned int prp>
struct sBitwiseOr_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = bitwiseOr_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1 | r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
//! special reduction: emit the start offset of each segment
template<unsigned int prp>
struct sstart_
{
	typedef boost::mpl::int_<prp> prop;

	//! never called: special reductions work through set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_prev;
	}
};
//! special reduction: emit the stop offset of each segment
template<unsigned int prp>
struct sstop_
{
	typedef boost::mpl::int_<prp> prop;

	//! never called: special reductions work through set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next;
	}
};
//! special reduction: emit the number of elements in each segment
template<unsigned int prp>
struct snum_
{
	typedef boost::mpl::int_<prp> prop;

	//! never called: special reductions work through set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return true;
	}

	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next - seg_prev;
	}
};
template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
                                               vector_index_type vit_block_n,
                                               vector_index_type vit_block_scan,
                                               vector_index_type vit_list_0,
                                               vector_index_type vit_list_1,
                                               unsigned int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	// move the full blocks
	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}

	// move the remainder
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}
}
template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
                                                          vector_index_type vit_block_n,
                                                          vector_index_type vit_block_scan,
                                                          vector_index_type vit_list_0,
                                                          vector_index_type vit_list_1,
                                                          unsigned int nslot)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vit_block_data.size())
	{return;}

	int pool_id = p / nslot;
	int thr_id = p % nslot;
	int start = vit_block_scan.template get<0>(pool_id);
	int n = vit_block_scan.template get<0>(pool_id+1) - start;

	// copy only the filled slots of this pool
	if (thr_id < n)
	{
		vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
		vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
	}
}
template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
                                      vector_index_type vit_block_n,
                                      vector_index_type vit_block_scan,
                                      vector_index_type vit_list_0,
                                      vector_index_type vit_list_1,
                                      unsigned int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	// move the full blocks
	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}

	// move the remainder
	if (threadIdx.x < n_move - i*blockDim.x)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}
}
//! functor that reduces the properties of two elements into a destination element
template<typename e_type, typename v_reduce>
struct data_merger
{
	e_type src1;
	e_type src2;
	e_type dst;

	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
	:src1(src1),src2(src2),dst(dst)
	{
	};

	//! apply the reduction for property T
	template<typename T>
	__device__ __host__ inline void operator()(T& t) const
	{
		typedef typename boost::mpl::at<v_reduce,T>::type red_type;

		dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
	}
};
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
                                vector_index_type merge_index, vector_data_type vct_add_data,
                                vector_index_type vct_index_out, vector_data_type vct_data_out,
                                vector_index_type2 vct_tot_out,
                                int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	if (p < vct_index.size())
	{
		index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		predicate = vct_index.template get<0>(p) != id_check;

		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);

		int index1 = merge_index.template get<0>(p);

		auto e = vct_data_out.get(blockIdx.x*block_dim + scan);

		if (index1 < base)
		{
			// the element comes from the original vector
			e = vct_data.get(index1);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
		else
		{
			// the element comes from the insert buffer
			e = vct_add_data.get(index1 - base);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
	}

	if (predicate == 0 && p < vct_index.size())
	{
		// the duplicate pair straddles a block boundary: the last thread writes the index anyway
		if (threadIdx.x == blockDim.x-1)
		{vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);}

		typedef boost::mpl::vector<v_reduce ...> v_reduce_;

		int index1 = merge_index.template get<0>(p);
		int index2 = merge_index.template get<0>(p+1) - base;

		data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
		                                                    vct_add_data.get(index2),
		                                                    vct_data_out.get(blockIdx.x*block_dim + scan));

		// reduce every property
		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
		vct_tot_out.template get<2>(blockIdx.x) = predicate;
	}
}
template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
                                       vector_index_type merge_index,
                                       vector_index_type vct_index_out,
                                       vector_index_type vct_index_out_ps,
                                       vector_index_type2 vct_tot_out,
                                       int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	typedef cub::BlockScan<int, block_dim> BlockScan;
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	if (p < vct_index.size())
	{
		index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
		index_type id_check = vct_index.template get<0>(p);
		predicate = id_check != id_check_p;
		predicate &= id_check != id_check_n;
		int mi = merge_index.template get<0>(p);
		predicate &= (mi < base);

		scan = predicate;
	}

	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
		vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
	}
}
template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
                                   vector_type2 output,
                                   typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= segment_offset.size())
	{return;}

	typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
	if (p == segment_offset.size()-1)
	{v = max_index;}
	else
	{v = segment_offset.template get<1>(p+1);}

	red_op::set(v,segment_offset.template get<1>(p),output,p);
}
template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
                        vector_index_type vct_index_out, vector_data_type vct_data_out,
                        vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())
	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	// this slot is padding
	if (threadIdx.x > tot)
	{return;}

	// when the last element of the block was kept, slot tot is empty
	if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
	{return;}

	// the first element was already merged by the previous block
	if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	if (ds + threadIdx.x >= vct_index_out.size())
	{return;}

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	auto src = vct_data.get(p);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
                               vector_index_type vct_index_out, vct_data_type vct_data_out,
                               vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size())
	{return;}

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	if (threadIdx.x >= tot)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	int oi = vct_m_index.template get<0>(p);

	auto src = vct_data.get(oi);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())
	{return;}

	// reorder based on the permutation in vi
	v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}
template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size())
	{return;}

	seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}
template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vd.size())
	{return;}

	vd.template get<prp>(p) = p + off;
}
#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */