OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
map_vector_sparse_cuda_kernels.cuh
/*
 * map_vector_sparse_cuda_kernels.cuh
 *
 *  Created on: Jan 26, 2019
 *      Author: i-bird
 */

#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#ifdef __NVCC__

#include "config.h"

#include <limits>

#if CUDART_VERSION < 11000
#include "util/cuda_launch.hpp"
#endif

#if !defined(CUDA_ON_CPU)
#include "util/cudify/cuda/operators.hpp"
#endif

#endif

template<typename type_t>
struct zero_t {
    __device__ __host__ type_t operator()() const {
        return 0;
    }
};

template<typename type_t>
struct limit_max_t {
    __device__ __host__ type_t operator()() const {
        return std::numeric_limits<type_t>::max();
    }
};
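
/*! \brief zero_t and limit_max_t are initial-value functors.
 *
 * Reductions use them to seed the accumulator: 0 for sums (and for maxima,
 * which therefore assume non-negative data), the largest representable value
 * for minima. A minimal host-side sketch of the pattern (illustrative only,
 * not part of this header):
 *
 * \code
 * float acc = limit_max_t<float>()();      // acc starts at FLT_MAX
 * float vals[3] = {3.0f, 1.0f, 2.0f};
 * for (int i = 0 ; i < 3 ; i++)
 * {acc = (vals[i] < acc)?vals[i]:acc;}     // acc == 1.0f, a min-reduction
 * \endcode
 */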

template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        return b;
    }
};

template<unsigned int prp>
struct sRight_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = rightOperand_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        return a;
    }
};

template<unsigned int prp>
struct sLeft_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = leftOperand_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

template<unsigned int prp>
struct sadd_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::plus_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1 + r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
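
/*! \brief Shape shared by all reduction functors in this file.
 *
 * Each functor bundles the property index (prop), a device binary operator
 * (op_red), optionally an initial value (op_initial_value), and a scalar
 * combiner red(). A sketch of combining two values with sadd_ (illustrative
 * only):
 *
 * \code
 * typedef sadd_<0> red_t;          // sum-reduce property 0
 * float a = 1.5f, b = 2.5f;
 * float r = red_t::red(a,b);       // r == 4.0f
 * // red_t::prop::value == 0 selects the property inside an aggregate
 * \endcode
 */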

#ifdef __NVCC__

template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
    // global point index
    const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vadd.size())
    {return;}

    vadd.template get<0>(p) = 1;
}

// element-wise sum over a fixed-size block of values
template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
    __device__ __host__ type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = a[i] + b[i];
        }
        return res;
    }
};

#endif
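
/*! \brief set_one_insert_buffer writes 1 into property 0 of every element.
 *
 * A hedged launch sketch, assuming an openfpm::vector_gpu-like container
 * that exposes size(), getGPUIterator() and toKernel() (names illustrative):
 *
 * \code
 * auto ite = vadd.getGPUIterator();                  // grid/block decomposition
 * CUDA_LAUNCH(set_one_insert_buffer,ite,vadd.toKernel());
 * \endcode
 */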

template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = r1[i] + r2[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

template<unsigned int prp>
struct smax_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::maximum_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return (r1 < r2)?r2:r1;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
    GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = max(a[i], b[i]);
        }
        return res;
    }
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};
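
/*! \brief The *_block_ variants reduce element-wise over fixed-size blocks.
 *
 * They apply the scalar operation to each of the blockLength components of a
 * block-structured type indexable with operator[]. A sketch with a plain
 * wrapper type (illustrative only, blk is hypothetical):
 *
 * \code
 * struct blk { float v[4];
 *              float & operator[](int i) {return v[i];}
 *              const float & operator[](int i) const {return v[i];} };
 * blk a = {{1,2,3,4}}, b = {{4,3,2,1}};
 * blk m = smax_block_<0,4>::red(a,b);   // m.v == {4,3,3,4}
 * \endcode
 */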


template<unsigned int prp>
struct smin_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = gpu::minimum_t<red_type>;
    template<typename red_type> using op_initial_value = limit_max_t<red_type>;
#endif

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return (r1 < r2)?r1:r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
    GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
        type_t res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = min(a[i], b[i]);
        }
        return res;
    }
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
    typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
    template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
    template<typename red_type> using op_initial_value = limit_max_t<red_type>;
#endif

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        red_type res;
        for (int i=0; i<blockLength; ++i)
        {
            res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
        }
        return res;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};

#ifdef __NVCC__

template<typename type_t>
struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
    GPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
        return a|b;
    }
};

template<unsigned int prp>
struct sBitwiseOr_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = bitwiseOr_t<red_type>;

    template<typename red_type>
    __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return r1|r2;
    }

    static bool is_special()
    {
        return false;
    }

    //! not a special reduction, so set() does nothing
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {}
};


template<unsigned int prp>
struct sstart_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;
    template<typename red_type> using op_initial_value = zero_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the offset at which segment i starts
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_prev;
    }
};

template<unsigned int prp>
struct sstop_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the offset at which segment i ends
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_next;
    }
};

template<unsigned int prp>
struct snum_
{
    typedef boost::mpl::int_<prp> prop;

    template<typename red_type> using op_red = gpu::minimum_t<red_type>;

    template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
    {
        return 0;
    }

    static bool is_special()
    {
        return true;
    }

    //! special reduction: store the number of elements in segment i
    template<typename seg_type, typename output_type>
    __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
    {
        output.template get<0>(i) = seg_next - seg_prev;
    }
};
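
/*! \brief sstart_, sstop_ and snum_ are "special" reductions.
 *
 * They do not combine values; instead set() post-processes the segment
 * offsets produced by a scan: sstart_ writes where each segment begins,
 * sstop_ where it ends, snum_ how many elements it contains. For example,
 * with consecutive offsets seg_prev = 10 and seg_next = 14, snum_ records
 * 14 - 10 = 4 elements for that segment:
 *
 * \code
 * // inside reduce_from_offset (below), for segment i:
 * // output.template get<0>(i) = seg_next - seg_prev;   // snum_
 * \endcode
 */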


template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
                                               vector_index_type vit_block_n,
                                               vector_index_type vit_block_scan,
                                               vector_index_type vit_list_0,
                                               vector_index_type vit_list_1,
                                               int nslot)
{
    int n_move = vit_block_n.template get<0>(blockIdx.x);
    int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
    int start = vit_block_scan.template get<0>(blockIdx.x);

    // copy full chunks of blockDim.x elements
    int i = 0;
    for ( ; i < n_block_move ; i++)
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
    }

    // copy the remaining tail (fewer than blockDim.x elements)
    if (threadIdx.x < n_move - i*blockDim.x )
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
    }
}
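
/*! \brief Launch note for construct_insert_list_key_only.
 *
 * The kernel is written for one block per insert pool: block blockIdx.x
 * drains the pool [nslot*blockIdx.x, nslot*blockIdx.x + n) into the
 * compacted lists, writing keys to vit_list_0 and source positions to
 * vit_list_1. A hedged launch sketch (toKernel() and the counts are
 * illustrative):
 *
 * \code
 * // n_pools blocks, each consuming at most nslot entries of its pool
 * construct_insert_list_key_only<<<n_pools,block_size>>>(
 *        vit_block_data.toKernel(), vit_block_n.toKernel(),
 *        vit_block_scan.toKernel(), vit_list_0.toKernel(),
 *        vit_list_1.toKernel(), nslot);
 * \endcode
 */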

template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
                                                          vector_index_type vit_block_n,
                                                          vector_index_type vit_block_scan,
                                                          vector_index_type vit_list_0,
                                                          vector_index_type vit_list_1,
                                                          int nslot)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vit_block_data.size()) {return;}

    int pool_id = p / nslot;
    int thr_id = p % nslot;
    int start = vit_block_scan.template get<0>(pool_id);
    int n = vit_block_scan.template get<0>(pool_id+1) - start;

    // copy only the filled slots of this pool
    if (thr_id < n )
    {
        vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
        vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
    }
}


template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
                                      vector_index_type vit_block_n,
                                      vector_index_type vit_block_scan,
                                      vector_index_type vit_list_0,
                                      vector_index_type vit_list_1,
                                      int nslot)
{
    int n_move = vit_block_n.template get<0>(blockIdx.x);
    int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
    int start = vit_block_scan.template get<0>(blockIdx.x);

    // copy full chunks of blockDim.x elements
    int i = 0;
    for ( ; i < n_block_move ; i++)
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
    }

    // copy the remaining tail (fewer than blockDim.x elements)
    if (threadIdx.x < n_move - i*blockDim.x )
    {
        vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
        vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
    }
}


template<typename e_type, typename v_reduce>
struct data_merger
{
    e_type src1;
    e_type src2;

    e_type dst;

    __device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
    :src1(src1),src2(src2),dst(dst)
    {
    };

    //! apply the reduction at position T of v_reduce to its own property
    template<typename T>
    __device__ __host__ inline void operator()(T& t) const
    {
        typedef typename boost::mpl::at<v_reduce,T>::type red_type;

        dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
    }
};
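
/*! \brief data_merger applies every reduction in v_reduce to one conflict.
 *
 * boost::mpl::for_each_ref iterates the compile-time range and calls
 * operator() once per reduction, so each listed property is combined with
 * its own operator. A sketch of the dispatch (names src1/src2/dst are
 * illustrative):
 *
 * \code
 * typedef boost::mpl::vector<sadd_<0>,smax_<1>> v_reduce_;
 * data_merger<decltype(vct_data.get(0)),v_reduce_> dm(src1,src2,dst);
 * boost::mpl::for_each_ref<boost::mpl::range_c<int,0,2>>(dm);
 * // property 0 of dst = src1 + src2, property 1 = max(src1,src2)
 * \endcode
 */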

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
                                vector_index_type merge_index, vector_data_type vct_add_data,
                                vector_index_type vct_index_out, vector_data_type vct_data_out,
                                vector_index_type2 vct_tot_out,
                                int base)
{
    typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

    // Specialize BlockScan for a 1D block of block_dim threads on type int
    typedef cub::BlockScan<int, block_dim> BlockScan;
    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage;

    int p = blockIdx.x * blockDim.x + threadIdx.x;

    int scan = 0;
    int predicate = 0;

    if (p < vct_index.size())
    {
        index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
        predicate = vct_index.template get<0>(p) != id_check;

        scan = predicate;
    }

    // in-shared-memory exclusive scan of the predicates
    BlockScan(temp_storage).ExclusiveSum(scan, scan);

    if (predicate == 1 && p < vct_index.size())
    {
        vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);

        int index1 = merge_index.template get<0>(p);

        auto e = vct_data_out.get(blockIdx.x*block_dim + scan);

        if (index1 < base)
        {
            e = vct_data.get(index1);
            vct_data_out.get(blockIdx.x*block_dim + scan) = e;
        }
        else
        {
            e = vct_add_data.get(index1 - base);
            vct_data_out.get(blockIdx.x*block_dim + scan) = e;
        }
    }

    __syncthreads();

    if (predicate == 0 && p < vct_index.size())
    {
        // the last thread still writes its index when the duplicate pair
        // straddles the block boundary; realign later drops the redundant copy
        if (threadIdx.x == blockDim.x-1)
        {vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);}

        // we have to merge the data

        typedef boost::mpl::vector<v_reduce ...> v_reduce_;

        int index1 = merge_index.template get<0>(p);
        int index2 = merge_index.template get<0>(p+1) - base;

        data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
                                                            vct_add_data.get(index2),
                                                            vct_data_out.get(blockIdx.x*block_dim + scan));

        // merge every property listed in v_reduce
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
    }

    if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
    {
        vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
        vct_tot_out.template get<2>(blockIdx.x) = predicate;
    }
}
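
/*! \brief How solve_conflicts works.
 *
 * vct_index holds the merged (sorted) old+new indices and merge_index the
 * provenance of each entry (positions >= base come from vct_add_data).
 * Every thread flags whether its index differs from the next one; a block
 * exclusive scan of the flags gives the compacted output position, and a
 * duplicated index (predicate == 0) is resolved by merging the two payloads
 * through the v_reduce pack. A hedged launch sketch (container names and
 * toKernel() are illustrative):
 *
 * \code
 * const unsigned int bdim = 128;
 * int nblocks = (vct_index.size() + bdim - 1) / bdim;
 * solve_conflicts<decltype(vct_index.toKernel()),decltype(vct_data.toKernel()),
 *                 decltype(vct_tot_out.toKernel()),bdim,sadd_<0>>
 *     <<<nblocks,bdim>>>(vct_index.toKernel(),vct_data.toKernel(),
 *                        merge_index.toKernel(),vct_add_data.toKernel(),
 *                        vct_index_out.toKernel(),vct_data_out.toKernel(),
 *                        vct_tot_out.toKernel(),base);
 * \endcode
 */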


template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
                                       vector_index_type merge_index,
                                       vector_index_type vct_index_out,
                                       vector_index_type vct_index_out_ps,
                                       vector_index_type2 vct_tot_out,
                                       int base)
{
    typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

    // Specialize BlockScan for a 1D block of block_dim threads on type int
    typedef cub::BlockScan<int, block_dim> BlockScan;
    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage;

    int p = blockIdx.x * blockDim.x + threadIdx.x;

    int scan = 0;
    int predicate = 0;
    if (p < vct_index.size())
    {
        index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
        index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
        index_type id_check = vct_index.template get<0>(p);
        // keep an index only if it is unique and comes from the original vector
        predicate = id_check != id_check_p;
        predicate &= id_check != id_check_n;
        int mi = merge_index.template get<0>(p);
        predicate &= (mi < base);

        scan = predicate;
    }
    // in-shared-memory exclusive scan of the predicates
    BlockScan(temp_storage).ExclusiveSum(scan, scan);

    if (predicate == 1 && p < vct_index.size())
    {
        vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
        vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
    }

    __syncthreads();

    if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
    {
        vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
    }
}

template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
                                   vector_type2 output,
                                   typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= segment_offset.size()) return;

    typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
    if (p == segment_offset.size()-1)
    {v = max_index;}
    else
    {v = segment_offset.template get<1>(p+1);}

    red_op::set(v,segment_offset.template get<1>(p),output,p);
}
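
/*! \brief reduce_from_offset worked example.
 *
 * Given segment offsets [0, 3, 7] over 9 total elements (max_index = 9),
 * launching with red_op = snum_<0> writes the segment sizes 3, 4 and 2 into
 * output, since each thread computes seg_next - seg_prev. A hedged sketch
 * (container names and toKernel() are illustrative):
 *
 * \code
 * reduce_from_offset<decltype(seg.toKernel()),decltype(out.toKernel()),snum_<0>>
 *     <<<nblocks,bdim>>>(seg.toKernel(),out.toKernel(),9);
 * \endcode
 */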

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
                        vector_index_type vct_index_out, vector_data_type vct_data_out,
                        vector_index_type2 vct_tot_out_scan)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vct_index.size()) return;

    int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

    // tot is the number of entries this block produced in solve_conflicts;
    // threads beyond that count have nothing to copy. The comparison is strict
    // because thread tot may still own a block-boundary merge (checked below)
    if (threadIdx.x > tot)
    {return;}

    // if the last element of this block closed its own output (predicate == 1),
    // slot tot holds no spill-over and thread tot can exit
    if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
    {return;}

    // this branch exists if the previous block (last thread) had predicate == 0
    // in solve_conflicts: in that case the thread 0 of the next block does not
    // have to produce any data
    if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
    {return;}

    int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

    if (ds + threadIdx.x >= vct_index_out.size())
    {return;}

    vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

    auto src = vct_data.get(p);
    auto dst = vct_data_out.get(ds+threadIdx.x);

    dst = src;
}

template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
                               vector_index_type vct_index_out, vct_data_type vct_data_out,
                               vector_index_type2 vct_tot_out_scan)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vct_index.size()) return;

    int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

    if (threadIdx.x >= tot)
    {return;}

    int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

    vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

    int oi = vct_m_index.template get<0>(p);

    auto src = vct_data.get(oi);
    auto dst = vct_data_out.get(ds+threadIdx.x);

    dst = src;
}

template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vi.size()) return;

    // reorder the data vector according to the index map vi

    v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}

template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vi.size()) return;

    // permute the segment indices according to the index map vi

    seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}


template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    if (p >= vd.size()) {return;}

    vd.template get<prp>(p) = p + off;
}
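
/*! \brief set_indexes fills property prp with consecutive indices plus off.
 *
 * After the call, element p holds p + off, e.g. to initialize a merge map
 * before sorting. A hedged usage sketch (getGPUIterator(), toKernel() and
 * CUDA_LAUNCH are assumed from the OpenFPM utilities):
 *
 * \code
 * auto ite = vd.getGPUIterator();
 * CUDA_LAUNCH((set_indexes<0>),ite,vd.toKernel(),0);   // vd<0>(p) = p
 * \endcode
 */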

#endif

#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */