OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
map_vector_sparse_cuda_kernels.cuh
/*
 * map_vector_sparse_cuda_kernels.cuh
 *
 *  Created on: Jan 26, 2019
 *      Author: i-bird
 */

#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#ifdef __NVCC__

#include "config.h"

#if CUDART_VERSION < 11000
#include "util/cuda/moderngpu/operators.hxx"
#include "util/cuda_launch.hpp"
#else
	#if !defined(CUDA_ON_CPU)
	#include "util/cuda/moderngpu/operators.hxx"
	#endif
#endif

#endif

template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return b;
	}
};

template<unsigned int prp>
struct sRight_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = rightOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return a;
	}
};

template<unsigned int prp>
struct sLeft_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = leftOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
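
/* Usage sketch (illustrative, not part of the original header): every
 * reduction functor in this file exposes the same static interface, so
 * merge code can be written generically against it. Assuming a float
 * property:
 *
 *   float a = 1.0f, b = 2.0f;
 *   float l = sLeft_<0>::red(a, b);   // 1.0f: keeps the left operand
 *   float r = sRight_<0>::red(a, b);  // 2.0f: keeps the right operand
 *
 * op_red exposes the equivalent moderngpu binary operator for segmented
 * primitives, is_special() tells whether red() is used at all, and set()
 * is only meaningful for the special reductions further below.
 */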

template<unsigned int prp>
struct sadd_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::plus_t<red_type>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1 + r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
	// points
	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vadd.size())
	{return;}

	vadd.template get<0>(p) = 1;
}
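
/* Usage sketch of a launch (illustrative; assumes vadd is an OpenFPM GPU
 * vector exposing getGPUIterator() and toKernel(), as elsewhere in OpenFPM):
 *
 *   auto ite = vadd.getGPUIterator();
 *   CUDA_LAUNCH(set_one_insert_buffer, ite, vadd.toKernel());
 *
 * Every element of property 0 is set to 1, e.g. to count one insertion per slot.
 */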

template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = a[i] + b[i];
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = r1[i] + r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<unsigned int prp>
struct smax_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::maximum_t<red_type>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r2:r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = max(a[i], b[i]);
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<unsigned int prp>
struct smin_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r1:r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = min(a[i], b[i]);
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t>
struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		return a|b;
	}
};

template<unsigned int prp>
struct sBitwiseOr_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = bitwiseOr_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1|r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! is not a special reduction so it does not need it
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<unsigned int prp>
struct sstart_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! special reduction: set() writes the start of the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_prev;
	}
};

template<unsigned int prp>
struct sstop_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! special reduction: set() writes the end of the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next;
	}
};

template<unsigned int prp>
struct snum_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! special reduction: set() writes the number of elements in the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next - seg_prev;
	}
};
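
/* Note (illustrative summary, not part of the original header): sstart_,
 * sstop_ and snum_ are "special" reductions (is_special() returns true):
 * red() is a dummy, and set() instead derives the output from the offsets
 * of two consecutive segments. For a segment i delimited by seg_prev and
 * seg_next:
 *
 *   sstart_<0>::set(seg_next, seg_prev, out, i);  // out<0>(i) = seg_prev (segment start)
 *   sstop_<0>::set(seg_next, seg_prev, out, i);   // out<0>(i) = seg_next (segment end)
 *   snum_<0>::set(seg_next, seg_prev, out, i);    // out<0>(i) = seg_next - seg_prev (segment size)
 */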

template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
		vector_index_type vit_block_n,
		vector_index_type vit_block_scan,
		vector_index_type vit_list_0,
		vector_index_type vit_list_1,
		int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}

	// move remaining
	if (threadIdx.x < n_move - i*blockDim.x )
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}
}
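
/* Usage sketch of a launch (illustrative): the kernel uses one block per
 * insertion pool, with nslot elements reserved per pool and vit_block_scan
 * holding the exclusive scan of vit_block_n:
 *
 *   construct_insert_list_key_only<<<n_pools, block_size>>>(
 *       vit_block_data.toKernel(), vit_block_n.toKernel(),
 *       vit_block_scan.toKernel(), vit_list_0.toKernel(),
 *       vit_list_1.toKernel(), nslot);
 *
 * Each block compacts the keys of its pool into vit_list_0 and records the
 * source slot of every key in vit_list_1.
 */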

template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
		vector_index_type vit_block_n,
		vector_index_type vit_block_scan,
		vector_index_type vit_list_0,
		vector_index_type vit_list_1,
		int nslot)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vit_block_data.size()) {return;}

	int pool_id = p / nslot;
	int thr_id = p % nslot;
	int start = vit_block_scan.template get<0>(pool_id);
	int n = vit_block_scan.template get<0>(pool_id+1) - start;

	// move remaining
	if (thr_id < n )
	{
		vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
		vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
	}
}


template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
		vector_index_type vit_block_n,
		vector_index_type vit_block_scan,
		vector_index_type vit_list_0,
		vector_index_type vit_list_1,
		int nslot)
{
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}

	// move remaining
	if (threadIdx.x < n_move - i*blockDim.x )
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}
}

template<typename e_type, typename v_reduce>
struct data_merger
{
	e_type src1;
	e_type src2;

	e_type dst;

	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
	:src1(src1),src2(src2),dst(dst)
	{
	};

	//! for each reduction operator in v_reduce, merge the corresponding property
	template<typename T>
	__device__ __host__ inline void operator()(T& t) const
	{
		typedef typename boost::mpl::at<v_reduce,T>::type red_type;

		dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
	}
};
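
/* Usage sketch (illustrative): v_reduce is a boost::mpl::vector of the
 * reduction functors above, one per property to merge; for_each_ref calls
 * operator() once per functor, which applies red_type::red to that property:
 *
 *   typedef boost::mpl::vector<sadd_<0>, smax_<1>> v_reduce_;
 *   data_merger<decltype(vct_data.get(0)), v_reduce_> dm(
 *       vct_data.get(i1), vct_add_data.get(i2), vct_data_out.get(o));
 *   boost::mpl::for_each_ref<boost::mpl::range_c<int, 0, 2>>(dm);
 *
 * This is how solve_conflicts below drives the merge.
 */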

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
		vector_index_type merge_index, vector_data_type vct_add_data,
		vector_index_type vct_index_out, vector_data_type vct_data_out,
		vector_index_type2 vct_tot_out,
		int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// Specialize BlockScan for a 1D block of block_dim threads of type int
	typedef cub::BlockScan<int, block_dim> BlockScan;
	// Allocate shared memory for BlockScan
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	if (p < vct_index.size())
	{
		index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		predicate = vct_index.template get<0>(p) != id_check;

		scan = predicate;
	}

	// in shared memory scan
	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);

		int index1 = merge_index.template get<0>(p);

		auto e = vct_data_out.get(blockIdx.x*block_dim + scan);

		if (index1 < base)
		{
			e = vct_data.get(index1);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
		else
		{
			e = vct_add_data.get(index1 - base);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
	}

	__syncthreads();

	if (predicate == 0 && p < vct_index.size())
	{
		if (threadIdx.x == blockDim.x-1)
		{vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);}

		// we have to merge the data
		typedef boost::mpl::vector<v_reduce ...> v_reduce_;

		int index1 = merge_index.template get<0>(p);
		int index2 = merge_index.template get<0>(p+1) - base;

		data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
				vct_add_data.get(index2),
				vct_data_out.get(blockIdx.x*block_dim + scan));

		// data_merge
		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
		vct_tot_out.template get<2>(blockIdx.x) = predicate;
	}
}
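
/* Usage sketch of a launch (illustrative): block_dim must match the launch
 * block size, since both BlockScan and the per-block output offsets assume
 * it; sadd_<0> here is an example reduction for property 0:
 *
 *   constexpr unsigned int block_dim = 128;
 *   int n_blocks = (vct_index.size() + block_dim - 1) / block_dim;
 *   solve_conflicts<decltype(vct_index.toKernel()), decltype(vct_data.toKernel()),
 *                   decltype(vct_tot_out.toKernel()), block_dim, sadd_<0>>
 *       <<<n_blocks, block_dim>>>(vct_index.toKernel(), vct_data.toKernel(),
 *                                 merge_index.toKernel(), vct_add_data.toKernel(),
 *                                 vct_index_out.toKernel(), vct_data_out.toKernel(),
 *                                 vct_tot_out.toKernel(), base);
 */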

template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
		vector_index_type merge_index,
		vector_index_type vct_index_out,
		vector_index_type vct_index_out_ps,
		vector_index_type2 vct_tot_out,
		int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// Specialize BlockScan for a 1D block of block_dim threads of type int
	typedef cub::BlockScan<int, block_dim> BlockScan;
	// Allocate shared memory for BlockScan
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;
	if (p < vct_index.size())
	{
		index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
		index_type id_check = vct_index.template get<0>(p);
		predicate = id_check != id_check_p;
		predicate &= id_check != id_check_n;
		int mi = merge_index.template get<0>(p);
		predicate &= (mi < base);

		scan = predicate;
	}
	// in shared memory scan
	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
		vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
	}

	__syncthreads();

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
	}
}

template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
		vector_type2 output,
		typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= segment_offset.size()) return;

	typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
	if (p == segment_offset.size()-1)
	{v = max_index;}
	else
	{v = segment_offset.template get<1>(p+1);}

	red_op::set(v,segment_offset.template get<1>(p),output,p);
}
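
/* Usage sketch (illustrative): combined with the special reductions above,
 * reduce_from_offset turns a vector of segment offsets (property 1) into
 * per-segment quantities; e.g. with red_op = snum_<0> each output element
 * receives the size of its segment, the last one closed by max_index:
 *
 *   reduce_from_offset<decltype(segment_offset.toKernel()),
 *                      decltype(output.toKernel()), snum_<0>>
 *       <<<n_blocks, block_size>>>(segment_offset.toKernel(),
 *                                  output.toKernel(), max_index);
 */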

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
		vector_index_type vct_index_out, vector_data_type vct_data_out,
		vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size()) return;

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	// tot is the number of elements this block has to write out;
	// thread tot itself may still have to write the block-boundary element (handled below)
	if (threadIdx.x > tot)
	{return;}

	if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
	{return;}

	// this branch exists if the previous block (last thread) had a predicate == 0 in solve_conflicts; in that case
	// thread 0 of the next block does not have to produce any data
	if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	if (ds + threadIdx.x >= vct_index_out.size())
	{return;}

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	auto src = vct_data.get(p);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}

template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
		vector_index_type vct_index_out, vct_data_type vct_data_out,
		vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size()) return;

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	if (threadIdx.x >= tot)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	int oi = vct_m_index.template get<0>(p);

	auto src = vct_data.get(oi);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}

template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size()) return;

	// reorder the data vector based on the order given by vi
	v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}

template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size()) return;

	// reorder the segment indexes based on the order given by vi
	seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}


template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vd.size()) {return;}

	vd.template get<prp>(p) = p + off;
}
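
/* Usage sketch (illustrative): fills property prp with consecutive global
 * indexes shifted by off, so element p receives p + off:
 *
 *   set_indexes<1><<<(vd.size() + 127) / 128, 128>>>(vd.toKernel(), off);
 */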

#endif

#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */