OpenFPM 5.2.0
Project that contains the implementation of distributed structures
map_vector_sparse_cuda_kernels.cuh
/*
 * map_vector_sparse_cuda_kernels.cuh
 *
 * Created on: Jan 26, 2019
 * Author: i-bird
 */

#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#ifdef __NVCC__

#include "config.h"

#include <limits>

#if CUDART_VERSION < 11000
#include "util/cuda_util.hpp"
#endif

#if !defined(CUDA_ON_CPU)
#include "util/cudify/cuda/operators.hpp"
#endif

#endif

template<typename type_t>
struct zero_t {
    __device__ __host__ type_t operator()() const {
        return 0;
    }
};

template<typename type_t>
struct limit_max_t {
    __device__ __host__ type_t operator()() const {
        return std::numeric_limits<type_t>::max();
    }
};
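
// zero_t and limit_max_t supply the identity (initial) element for the segmented
// reductions below: zero for additive reductions, the largest representable value
// for minimum reductions. A minimal host-side sketch of how such a functor is
// used (illustrative only, values hypothetical):
//
//   zero_t<float> init;
//   float acc = init();        // acc == 0.0f
//   acc = acc + 3.0f;          // fold values on top of the identity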

template<typename type_t>
struct rightOperand_t {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        return b;
    }
};

template<unsigned int prp>
struct sRight_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = rightOperand_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

template<typename type_t>
struct leftOperand_t {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        return a;
    }
};

template<unsigned int prp>
struct sLeft_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = leftOperand_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
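
// On duplicate keys sRight_ keeps the right (newly inserted) operand and sLeft_
// the left (already stored) one; neither combines values. Illustrative behaviour
// (hypothetical values):
//
//   float stored = 1.0f, inserted = 7.0f;
//   sRight_<0>::red(stored, inserted);   // returns 7.0f: the insertion wins
//   sLeft_<0>::red(stored, inserted);    // returns 1.0f: the stored value wins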

template<unsigned int prp>
struct sadd_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::plus_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1 + r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
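
// sadd_ accumulates the property prp of duplicate insertions. Since red() is
// __device__ __host__ it can be exercised directly on the CPU (sketch,
// hypothetical values):
//
//   float a = 1.5f, b = 2.5f;
//   float c = sadd_<0>::red(a, b);   // c == 4.0f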

#ifdef __NVCC__

template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
    // global element index
    const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vadd.size())
    {return;}

    vadd.template get<0>(p) = 1;
}
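
// A hedged launch sketch for set_one_insert_buffer, assuming vadd is an OpenFPM
// device vector exposing size() and toKernel() (grid shape illustrative):
//
//   set_one_insert_buffer<<<(vadd.size() + 255) / 256, 256>>>(vadd.toKernel());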

template<typename type_t, unsigned int blockLength>
struct plus_block_t {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = a[i] + b[i];
        }
        return res;
    }
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = r1[i] + r2[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
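
// sadd_block_ is the element-wise counterpart of sadd_ for block (array-valued)
// properties: red() adds the two operands component by component. Sketch with a
// hypothetical wrapper type providing the operator[] that red() requires:
//
//   struct f2 { float v[2]; float & operator[](int i) { return v[i]; } };
//   f2 a{{1.0f, 2.0f}}, b{{3.0f, 4.0f}};
//   f2 c = sadd_block_<0,2>::red(a, b);   // c.v == {4.0f, 6.0f}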

template<unsigned int prp>
struct smax_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::maximum_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return (r1 < r2)?r2:r1;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
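
// Note that smax_ pairs gpu::maximum_t with zero_t, so the reduction identity is
// 0 rather than the lowest representable value; the pairwise red() itself is a
// plain maximum (hypothetical values):
//
//   int a = -3, b = -1;
//   int m = smax_<0>::red(a, b);   // m == -1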

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct maximum_block_t {
    __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = max(a[i], b[i]);
        }
        return res;
    }
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

template<unsigned int prp>
struct smin_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::minimum_t<red_type>;
    template<typename red_type> using op_initial_value = limit_max_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return (r1 < r2)?r1:r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct minimum_block_t {
    __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = min(a[i], b[i]);
        }
        return res;
    }
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = limit_max_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

#ifdef __NVCC__

template<typename type_t>
struct bitwiseOr_t {
    __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
        return a|b;
    }
};

template<unsigned int prp>
struct sBitwiseOr_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = bitwiseOr_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1|r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! is not special reduction so it does not need it
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
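
// sBitwiseOr_ merges bit-flag properties of duplicate keys by OR-ing them
// (hypothetical values):
//
//   unsigned int f1 = 0x1, f2 = 0x4;
//   unsigned int f = sBitwiseOr_<0>::red(f1, f2);   // f == 0x5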

template<unsigned int prp>
struct sstart_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the start offset of the segment
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_prev;
    }
};

template<unsigned int prp>
struct sstop_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the stop offset of the segment
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_next;
    }
};

template<unsigned int prp>
struct snum_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the number of elements in the segment
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_next - seg_prev;
    }
};
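
// sstart_, sstop_ and snum_ are the "special" reductions: red() is a dummy and
// the result is produced by set(), which receives the offsets of two consecutive
// segments. For segment offsets {0, 3, 7}, segment 0 has seg_prev = 0 and
// seg_next = 3, so (output names illustrative):
//
//   sstart_<0>::set(3, 0, output, 0);   // output.get<0>(0) = 0  (start offset)
//   sstop_<0>::set(3, 0, output, 0);    // output.get<0>(0) = 3  (stop offset)
//   snum_<0>::set(3, 0, output, 0);     // output.get<0>(0) = 3  (segment size)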

template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
                                               vector_index_type vit_block_n,
                                               vector_index_type vit_block_scan,
                                               vector_index_type vit_list_0,
                                               vector_index_type vit_list_1,
                                               int nslot)
{
    int n_move = vit_block_n.template get<0>(blockIdx.x);
    int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
    int start = vit_block_scan.template get<0>(blockIdx.x);

    int i = 0;
    for ( ; i < n_block_move ; i++)
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
    }

    // move the remaining elements (tail shorter than one full block pass)
    if (threadIdx.x < n_move - i*blockDim.x )
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
    }
}

template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
                                                          vector_index_type vit_block_n,
                                                          vector_index_type vit_block_scan,
                                                          vector_index_type vit_list_0,
                                                          vector_index_type vit_list_1,
                                                          int nslot)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vit_block_data.size()) {return;}

    int pool_id = p / nslot;
    int thr_id = p % nslot;
    int start = vit_block_scan.template get<0>(pool_id);
    int n = vit_block_scan.template get<0>(pool_id+1) - start;

    // copy the elements of this pool that are actually filled
    if (thr_id < n )
    {
        vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
        vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
    }
}
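
// The small-pool variant maps one thread per slot instead of one block per pool:
// thread p serves slot (p % nslot) of pool (p / nslot), which wastes fewer
// threads when nslot is small compared to the block size. A hedged launch sketch
// (vector names as in the signature, grid shape and npool illustrative):
//
//   int nthr = npool * nslot;
//   construct_insert_list_key_only_small_pool<<<(nthr + 255) / 256, 256>>>(
//       vit_block_data.toKernel(), vit_block_n.toKernel(), vit_block_scan.toKernel(),
//       vit_list_0.toKernel(), vit_list_1.toKernel(), nslot);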

template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
                                      vector_index_type vit_block_n,
                                      vector_index_type vit_block_scan,
                                      vector_index_type vit_list_0,
                                      vector_index_type vit_list_1,
                                      int nslot)
{
    int n_move = vit_block_n.template get<0>(blockIdx.x);
    int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
    int start = vit_block_scan.template get<0>(blockIdx.x);

    int i = 0;
    for ( ; i < n_block_move ; i++)
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
    }

    // move the remaining elements (tail shorter than one full block pass)
    if (threadIdx.x < n_move - i*blockDim.x )
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
    }
}

template<typename e_type, typename v_reduce>
struct data_merger
{
    e_type src1;
    e_type src2;

    e_type dst;

    __device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
    :src1(src1),src2(src2),dst(dst)
    {
    };

    //! for each reduction operation in v_reduce, merge the corresponding property of the two sources into the destination
    template<typename T>
    __device__ __host__ inline void operator()(T& t) const
    {
        typedef typename boost::mpl::at<v_reduce,T>::type red_type;

        dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
    }
};
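
// data_merger is driven with boost::mpl::for_each_ref over the index range
// [0, sizeof...(v_reduce)): iteration T selects the T-th reduction, reads its
// target property from both sources and writes the reduced value into dst.
// Conceptually, for v_reduce_ = boost::mpl::vector<sadd_<0>, smax_<1>> it
// expands to:
//
//   dst.get<0>() = src1.get<0>() + src2.get<0>();        // sadd_<0>
//   dst.get<1>() = max(src1.get<1>(), src2.get<1>());    // smax_<1>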

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
                                vector_index_type merge_index, vector_data_type vct_add_data,
                                vector_index_type vct_index_out, vector_data_type vct_data_out,
                                vector_index_type2 vct_tot_out,
                                int base)
{
    typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

    // Specialize BlockScan for a 1D block of block_dim threads on type int
    typedef cub::BlockScan<int, block_dim> BlockScan;
    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage;

    int p = blockIdx.x * blockDim.x + threadIdx.x;

    int scan = 0;
    int predicate = 0;

    if (p < vct_index.size())
    {
        index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
        predicate = vct_index.template get<0>(p) != id_check;

        scan = predicate;
    }

    // in shared memory scan
    BlockScan(temp_storage).ExclusiveSum(scan, scan);

    size_t vct_index_out_index = blockIdx.x*block_dim + scan;

    if (predicate == 1 && p < vct_index.size() && vct_index_out_index < vct_index_out.size())
    {
        vct_index_out.template get<0>(vct_index_out_index) = vct_index.template get<0>(p);

        int index1 = merge_index.template get<0>(p);

        auto e = vct_data_out.get(vct_index_out_index);

        if (index1 < base)
        {
            e = vct_data.get(index1);
            vct_data_out.get(vct_index_out_index) = e;
        }
        else
        {
            e = vct_add_data.get(index1 - base);
            vct_data_out.get(vct_index_out_index) = e;
        }
    }

    __syncthreads();

    if (predicate == 0 && p < vct_index.size() && vct_index_out_index < vct_index_out.size())
    {
        // the duplicate pair straddles the block boundary: the last thread still writes the key
        if (threadIdx.x == blockDim.x-1)
        {vct_index_out.template get<0>(vct_index_out_index) = vct_index.template get<0>(p);}

        // we have to merge the data

        typedef boost::mpl::vector<v_reduce ...> v_reduce_;

        int index1 = merge_index.template get<0>(p);
        int index2 = merge_index.template get<0>(p+1) - base;

        data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
                                                            vct_add_data.get(index2),
                                                            vct_data_out.get(vct_index_out_index));

        // data_merge
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
    }

    if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
    {
        vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
        vct_tot_out.template get<2>(blockIdx.x) = predicate;
    }
}
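
// Worked example for solve_conflicts: let the merged, sorted keys be {3,5,5,9}
// with base = 2 (merge indexes below 2 refer to vct_data, the rest to
// vct_add_data). The predicate "key differs from the next key" is {1,0,1,1} and
// its per-block exclusive scan {0,1,1,2} assigns the output slots. Key 5 appears
// twice: the thread holding the first copy (predicate == 0) merges the stored
// and the inserted element through data_merger, so {3,5,9} survive with 5
// carrying the reduced data. Per block, vct_tot_out records how many keys were
// produced (get<0>) and the predicate of the last thread (get<2>), which realign
// later uses to compact the output and resolve pairs straddling block borders.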

template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
                                       vector_index_type merge_index,
                                       vector_index_type vct_index_out,
                                       vector_index_type vct_index_out_ps,
                                       vector_index_type2 vct_tot_out,
                                       int base)
{
    typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

    // Specialize BlockScan for a 1D block of block_dim threads on type int
    typedef cub::BlockScan<int, block_dim> BlockScan;
    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage;

    int p = blockIdx.x * blockDim.x + threadIdx.x;

    int scan = 0;
    int predicate = 0;
    if (p < vct_index.size())
    {
        index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
        index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
        index_type id_check = vct_index.template get<0>(p);
        predicate = id_check != id_check_p;
        predicate &= id_check != id_check_n;
        int mi = merge_index.template get<0>(p);
        predicate &= (mi < base);

        scan = predicate;
    }
    // in shared memory scan
    BlockScan(temp_storage).ExclusiveSum(scan, scan);

    if (predicate == 1 && p < vct_index.size())
    {
        vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
        vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
    }

    __syncthreads();

    if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
    {
        vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
    }
}

template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
                                   vector_type2 output,
                                   typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= segment_offset.size()) return;

    typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
    if (p == segment_offset.size()-1)
    {v = max_index;}
    else
    {v = segment_offset.template get<1>(p+1);}

    red_op::set(v,segment_offset.template get<1>(p),output,p);
}

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
                        vector_index_type vct_index_out, vector_data_type vct_data_out,
                        vector_index_type2 vct_tot_out_scan)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vct_index.size()) return;

    int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

    // threads beyond the number of elements produced by this block have nothing to copy;
    // the comparison is strict ('>' rather than '>=') because the slot at position tot can
    // hold the merged element of a duplicate pair that straddles the block boundary
    if (threadIdx.x > tot) // NOTE COMMENT UP !!!!!!!!!
    {return;}

    if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
    {return;}

    // this branch exists if the previous block (last thread) had a predicate == 0 in solve_conflicts;
    // in that case the thread 0 of the next block does not have to produce any data
    if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
    {return;}

    int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

    if (ds + threadIdx.x >= vct_index_out.size())
    {return;}

    vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

    auto src = vct_data.get(p);
    auto dst = vct_data_out.get(ds+threadIdx.x);

    dst = src;
}
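
// realign is the compaction step after solve_conflicts: once the per-block
// counters (property 0 of vct_tot_out) have been exclusive-scanned into property
// 1 of vct_tot_out_scan, every block copies its surviving elements from the
// block-aligned staging slots to their final contiguous positions starting at
// that scanned offset.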

template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
                               vector_index_type vct_index_out, vct_data_type vct_data_out,
                               vector_index_type2 vct_tot_out_scan)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vct_index.size()) return;

    int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

    if (threadIdx.x >= tot)
    {return;}

    int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

    vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

    int oi = vct_m_index.template get<0>(p);

    auto src = vct_data.get(oi);
    auto dst = vct_data_out.get(ds+threadIdx.x);

    dst = src;
}

template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vi.size()) return;

    // reorder the data vector according to the index map vi

    v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}

template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vi.size()) return;

    // reorder the segment list according to the index map vi

    seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}

template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vd.size()) {return;}

    vd.template get<prp>(p) = p + off;
}
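
// set_indexes fills property prp with a shifted identity map (element p gets
// p + off). A hedged launch sketch, assuming vd exposes size() and toKernel()
// (grid shape illustrative):
//
//   set_indexes<0><<<(vd.size() + 127) / 128, 128>>>(vd.toKernel(), 0);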

#endif

#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */