OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
map_vector_sparse.hpp
1/*
2 * map_vector_sparse.hpp
3 *
4 * Created on: Jan 22, 2019
5 * Author: i-bird
6 */
7
8#ifndef MAP_VECTOR_SPARSE_HPP_
9#define MAP_VECTOR_SPARSE_HPP_
10
11#include "util/cuda_launch.hpp"
12#include "Vector/map_vector.hpp"
13#include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
14#include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
15#include "util/ofp_context.hpp"
16#include <iostream>
17#include <limits>
18
19#if defined(__NVCC__)
20 #include "util/cuda/kernels.cuh"
21#endif
22
23#include "util/cuda/scan_ofp.cuh"
24#include "util/cuda/sort_ofp.cuh"
25#include "util/cuda/segreduce_ofp.cuh"
26#include "util/cuda/merge_ofp.cuh"
27
28enum flush_type
29{
30 FLUSH_ON_HOST = 0,
31 FLUSH_ON_DEVICE = 1,
32 FLUSH_NO_DATA = 2
33};
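/* A minimal usage sketch (illustrative only): the flush_type flag passed to flush()
 * selects where the staged insertions are merged. Here vs, the reduction functor
 * sadd_<0> and the gpu::ofp_context_t instance ctx are assumed names for the example.
 *
 * \code
 * vs.template flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_HOST);   // merge on the CPU
 * vs.template flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE); // merge on the GPU
 * \endcode
 */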
34
35template<typename OfpmVectorT>
36using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;
37
38namespace openfpm
39{
40 // All props
41 template<typename sg_type>
42 struct htoD
43 {
45 sg_type & sg;
46
47 unsigned int lele;
48
49 htoD(sg_type & sg, unsigned int lele)
50 :sg(sg),lele(lele)
51 {};
52
53
55 template<typename T>
56 __device__ __host__ inline void operator()(T& t) const
57 {
58 sg.template hostToDevice<T::value>(lele,lele);
59 }
60 };
61
62 constexpr int VECTOR_SPARSE_STANDARD = 1;
63 constexpr int VECTOR_SPARSE_BLOCK = 2;
64
65 template<typename reduction_type, unsigned int impl>
66 struct cpu_block_process
67 {
68 template<typename encap_src, typename encap_dst>
69 static inline void process(encap_src & src, encap_dst & dst)
70 {
71 dst = reduction_type::red(dst,src);
72 }
73 };
74
75 template<typename reduction_type>
76 struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK>
77 {
78 template<typename encap_src, typename encap_dst>
79 static inline void process(encap_src & src, encap_dst & dst)
80 {
81 for (size_t i = 0 ; i < encap_src::size ; i++)
82 {
83 dst[i] = reduction_type::red(dst[i],src[i]);
84 }
85 }
86 };
87
88 template<typename reduction_type>
89 struct cpu_block_process<reduction_type,3>
90 {
91 template<typename encap_src, typename encap_dst,unsigned int N1>
92 static inline void process(encap_src & src, encap_dst (& dst)[N1])
93 {
94 for (unsigned int j = 0 ; j < N1 ; j++)
95 {
96 for (size_t i = 0 ; i < encap_dst::size ; i++)
97 {
98 dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
99 }
100 }
101 }
102
103 template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst>
104 static inline void process_e(encap_src & src, encap_dst & dst)
105 {
106 for (unsigned int j = 0 ; j < N1 ; j++)
107 {
108 for (size_t i = 0 ; i < blockSize ; i++)
109 {
110 dst[i][j] = reduction_type::red(dst[i][j],src[i][j]);
111 }
112 }
113 }
114 };
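/* Sketch of the reduction-functor contract assumed by cpu_block_process
 * (illustrative; my_add_red is a made-up name): the functor only has to expose a
 * static red(dst,src) returning the combined value, which process() then writes
 * back into dst element by element.
 *
 * \code
 * struct my_add_red
 * {
 *     template<typename T1, typename T2>
 *     static T1 red(T1 & dst, T2 & src)
 *     {
 *         return dst + src;   // additive combination of two property values
 *     }
 * };
 * // cpu_block_process<my_add_red,VECTOR_SPARSE_STANDARD>::process(src,dst)
 * // then reduces to: dst = my_add_red::red(dst,src);
 * \endcode
 */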
115
120 template<unsigned int impl, typename block_functor>
121 struct scalar_block_implementation_switch // Case for scalar implementations
122 {
123 template <unsigned int p, typename vector_index_type>
124 static void extendSegments(vector_index_type & segments, size_t dataSize)
125 {
126#ifdef __NVCC__
127 // Append trailing element to segment (marks end of last segment)
128 segments.resize(segments.size()+1);
129 segments.template get<p>(segments.size() - 1) = dataSize;
130 segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
131#else // __NVCC__
132 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
133#endif // __NVCC__
134 }
135
136 template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2>
137 static void segreduce(vector_data_type & vector_data,
138 vector_data_type & vector_data_unsorted,
139 vector_index_type & vector_data_map,
140 vector_index_type2 & segment_offset,
141 vector_data_type & vector_data_red,
142 block_functor & blf,
143 gpu::ofp_context_t & context)
144 {
145#ifdef __NVCC__
146 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
147 typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
148 typedef typename reduction_type::template op_red<red_type> red_op;
149 typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
150 typename reduction_type::template op_initial_value<red_type> initial_value_functor;
151
152 assert((std::is_same<seg_type,int>::value == true));
153
154 openfpm::segreduce(
155 (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
156 (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size()-1,
157 (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
158 red_op(), initial_value_functor(), context);
159#else // __NVCC__
160 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
161#endif // __NVCC__
162 }
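/* What the segmented reduction above computes, written as a plain host loop
 * (illustrative sketch only; the real path goes through openfpm::segreduce on the
 * device): every segment [offsets[s], offsets[s+1]) of the input is folded into
 * one output element with the chosen operator.
 *
 * \code
 * template<typename T, typename Op>
 * void segreduce_host(const T * in, const int * offsets, int n_seg,
 *                     T * out, Op op, T init)
 * {
 *     for (int s = 0 ; s < n_seg ; s++)
 *     {
 *         T acc = init;
 *         for (int j = offsets[s] ; j < offsets[s+1] ; j++)
 *         {acc = op(acc,in[j]);}
 *         out[s] = acc;
 *     }
 * }
 * \endcode
 */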
163
164
180 template <
181 typename vector_data_type,
182 typename vector_index_type,
183 typename vector_index_type2,
184 typename vector_index_dtmp_type,
185 typename Ti,
186 typename ... v_reduce>
187 static void solveConflicts(
188 vector_index_type & vct_index_old,
189 vector_index_type & vct_index_merge,
190 vector_index_type & vct_index_merge_id,
191 vector_index_type & vct_index_out,
192 vector_index_dtmp_type & vct_index_dtmp,
193 vector_index_type & data_map,
194 vector_index_type2 & segments_new,
195 vector_data_type & vct_data_old,
196 vector_data_type & vct_add_data,
197 vector_data_type & vct_add_data_unique,
198 vector_data_type & vct_data_out,
199 ite_gpu<1> & itew,
200 block_functor & blf,
201 gpu::ofp_context_t & context
202 )
203 {
204#ifdef __NVCC__
205
206 CUDA_LAUNCH((solve_conflicts<
207 decltype(vct_index_merge.toKernel()),
208 decltype(vct_data_old.toKernel()),
209 decltype(vct_index_dtmp.toKernel()),
210 128,
211 v_reduce ...
212 >),
213 itew,
214 vct_index_merge.toKernel(),vct_data_old.toKernel(),
215 vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(),
216 vct_index_out.toKernel(),vct_data_out.toKernel(),
217 vct_index_dtmp.toKernel(),
218 vct_index_old.size());
219
220 // we scan tmp3
221 openfpm::scan(
222 (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
223 vct_index_dtmp.size(),
224 (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
225 context);
226
227 // get the size to resize vct_index and vct_data
228 vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
229 int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
230
231 vct_index_old.resize(size);
232 vct_data_old.resize(size);
233
234 CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(),
235 vct_index_old.toKernel(), vct_data_old.toKernel(),
236 vct_index_dtmp.toKernel());
237
238
239#else // __NVCC__
240 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
241#endif // __NVCC__
242 }
243 };
244
245
246 template<typename block_functor>
247 struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
248 {
249 template <unsigned int p, typename vector_index_type>
250 static void extendSegments(vector_index_type & segments, size_t dataSize)
251 {
252#ifdef __NVCC__
253 // Append trailing element to segment (marks end of last segment)
254 segments.resize(segments.size()+1);
255 segments.template get<p>(segments.size() - 1) = dataSize;
256 segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
257#else // __NVCC__
258 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
259#endif // __NVCC__
260 }
261
262 template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2>
263 static void segreduce(vector_data_type & vector_data,
264 vector_data_type & vector_data_unsorted,
265 vector_index_type & vector_data_map,
266 vector_index_type2 & segment_offset,
267 vector_data_type & vector_data_red,
268 block_functor & blf,
269 gpu::ofp_context_t & context)
270 {
271
272 }
273
274 template <
275 typename vector_data_type,
276 typename vector_index_type,
277 typename vector_index_type2,
278 typename vector_index_dtmp_type,
279 typename Ti,
280 typename ... v_reduce>
281 static void solveConflicts(
282 vector_index_type & vct_index_old,
283 vector_index_type & vct_index_merge,
284 vector_index_type & vct_index_merge_id,
285 vector_index_type & vct_index_out,
286 vector_index_dtmp_type & vct_index_dtmp,
287 vector_index_type & data_map,
288 vector_index_type2 & segments_new,
289 vector_data_type & vct_data,
290 vector_data_type & vct_add_data,
291 vector_data_type & vct_add_data_unique,
292 vector_data_type & vct_data_out,
293 ite_gpu<1> & itew,
294 block_functor & blf,
295 gpu::ofp_context_t & context
296 )
297 {
298#ifdef __NVCC__
299 blf.template solve_conflicts<1,
300 decltype(vct_index_merge),
301 decltype(segments_new),
302 decltype(vct_data),
303 v_reduce ...>
304 (vct_index_merge, vct_index_merge_id, segments_new, data_map,
305 vct_data, vct_add_data,
306 vct_index_old, vct_data_out,
307 context);
308 vct_data_out.swap(vct_data);
309
310#else // __NVCC__
311 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
312#endif // __NVCC__
313 }
314 };
315
316 template<typename Ti>
317 struct reorder
318 {
319 Ti id;
320 Ti id2;
321
322 bool operator<(const reorder & t) const
323 {
324 return id < t.id;
325 }
326 };
327
328 template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type>
329 struct sparse_vector_reduction_cpu_impl
330 {
331 template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
332 static inline void red(size_t & i, vector_data_type & vector_data_red,
333 vector_data_type & vector_data,
334 vector_index_type & vector_index,
335 vector_index_type_reo & reorder_add_index_cpu)
336 {
337 size_t start = reorder_add_index_cpu.get(i).id;
338 red_type red = vector_data.template get<reduction_type::prop::value>(i);
339
340 size_t j = 1;
341 for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
342 {
343 cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
344 //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
345 }
346 vector_data_red.add();
347 vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
348
349 if (T::value == 0)
350 {
351 vector_index.add();
352 vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
353 }
354
355 i += j;
356 }
357 };
358
359
360 template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1>
361 struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]>
362 {
363 template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
364 static inline void red(size_t & i, vector_data_type & vector_data_red,
365 vector_data_type & vector_data,
366 vector_index_type & vector_index,
367 vector_index_type_reo & reorder_add_index_cpu)
368 {
369 size_t start = reorder_add_index_cpu.get(i).id;
370 red_type red[N1];
371
372 for (size_t k = 0 ; k < N1 ; k++)
373 {
374 red[k] = vector_data.template get<reduction_type::prop::value>(i)[k];
375 }
376
377 size_t j = 1;
378 for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
379 {
380 auto ev = vector_data.template get<reduction_type::prop::value>(i+j);
381 cpu_block_process<reduction_type,impl+1>::process(ev,red);
382 //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
383 }
384
385 vector_data_red.add();
386
387 for (size_t k = 0 ; k < N1 ; k++)
388 {
389 vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k];
390 }
391
392 if (T::value == 0)
393 {
394 vector_index.add();
395 vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
396 }
397
398 i += j;
399 }
400 };
401
412 template<typename vector_data_type,
413 typename vector_index_type,
414 typename vector_index_type_reo,
415 typename vector_reduction,
416 unsigned int impl>
417 struct sparse_vector_reduction_cpu
418 {
420 vector_data_type & vector_data_red;
421
423 vector_data_type & vector_data;
424
426 vector_index_type_reo & reorder_add_index_cpu;
427
429 vector_index_type & vector_index;
430
437 inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red,
438 vector_data_type & vector_data,
439 vector_index_type & vector_index,
440 vector_index_type_reo & reorder_add_index_cpu)
441 :vector_data_red(vector_data_red),vector_data(vector_data),reorder_add_index_cpu(reorder_add_index_cpu),vector_index(vector_index)
442 {};
443
445 template<typename T>
446 inline void operator()(T& t) const
447 {
448 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
449 typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
450
451 if (reduction_type::is_special() == false)
452 {
453 for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; )
454 {
454 {
455 sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu);
456
457/* size_t start = reorder_add_index_cpu.get(i).id;
458 red_type red = vector_data.template get<reduction_type::prop::value>(i);
459
460 size_t j = 1;
461 for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
462 {
463 cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
464 //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
465 }
466 vector_data_red.add();
467 vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
468
469 if (T::value == 0)
470 {
471 vector_index.add();
472 vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
473 }
474
475 i += j;*/
476 }
477 }
478 }
479 };
480
491 template<typename encap_src,
492 typename encap_dst,
493 typename vector_reduction>
494 struct sparse_vector_reduction_solve_conflict_assign_cpu
495 {
497 encap_src & src;
498
500 encap_dst & dst;
501
502
509 inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst)
510 :src(src),dst(dst)
511 {};
512
514 template<typename T>
515 inline void operator()(T& t) const
516 {
517 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
518
519 dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>();
520 }
521 };
522
523
524 template<unsigned int impl,typename vector_reduction, typename T,typename red_type>
525 struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl
526 {
527 template<typename encap_src, typename encap_dst>
528 static inline void red(encap_src & src, encap_dst & dst)
529 {
530 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
531
532 cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
533 }
534 };
535
536 template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1>
537 struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]>
538 {
539 template<typename encap_src, typename encap_dst>
540 static inline void red(encap_src & src, encap_dst & dst)
541 {
542 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
543
544 auto src_e = src.template get<reduction_type::prop::value>();
545 auto dst_e = dst.template get<reduction_type::prop::value>();
546
547 cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e);
548 }
549 };
550
561 template<typename encap_src,
562 typename encap_dst,
563 typename vector_reduction,
564 unsigned int impl>
565 struct sparse_vector_reduction_solve_conflict_reduce_cpu
566 {
568 encap_src & src;
569
571 encap_dst & dst;
572
573
580 inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst)
581 :src(src),dst(dst)
582 {};
583
585 template<typename T>
586 inline void operator()(T& t) const
587 {
588 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
589 typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type;
590
591 sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst);
592
593// cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
594// reduction_type::red(dst.template get<reduction_type::prop::value>(),src.template get<reduction_type::prop::value>());
595 }
596 };
597
608 template<typename vector_data_type,
609 typename vector_index_type,
610 typename vector_index_type2,
611 typename vector_reduction,
612 typename block_functor,
613 unsigned int impl2, unsigned int pSegment=1>
614 struct sparse_vector_reduction
615 {
617 vector_data_type & vector_data_red;
618
620 vector_data_type & vector_data;
621
623 vector_data_type & vector_data_unsorted;
624
626 vector_index_type2 & segment_offset;
627
629 vector_index_type & vector_data_map;
630
632 block_functor & blf;
633
635 gpu::ofp_context_t & context;
636
643 inline sparse_vector_reduction(vector_data_type & vector_data_red,
644 vector_data_type & vector_data,
645 vector_data_type & vector_data_unsorted,
646 vector_index_type & vector_data_map,
647 vector_index_type2 & segment_offset,
648 block_functor & blf,
649 gpu::ofp_context_t & context)
650 :vector_data_red(vector_data_red),
651 vector_data(vector_data),
652 vector_data_unsorted(vector_data_unsorted),
653 vector_data_map(vector_data_map),
654 segment_offset(segment_offset),
655 blf(blf),
656 context(context)
657 {};
658
660 template<typename T>
661 inline void operator()(T& t) const
662 {
663#ifdef __NVCC__
664
665 typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
666 typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
667 if (reduction_type::is_special() == false)
668 {
669 scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>(
670 vector_data,
671 vector_data_unsorted,
672 vector_data_map,
673 segment_offset,
674 vector_data_red,
675 blf,
676 context);
677 }
678#else
679 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
680#endif
681 }
682 };
683
684
685 struct stub_block_functor
686 {
687 template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type>
688 static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst)
689 {
690 return true;
691 }
692
693 template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
694 static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
695 vector_data_type &data1, vector_data_type &data2,
696 vector_index_type &indices_tmp, vector_data_type &data_tmp,
697 vector_index_type &keysOut, vector_data_type &dataOut,
698 gpu::ofp_context_t & context)
699 {
700 return true;
701 }
702
703 openfpm::vector_gpu<aggregate<unsigned int>> outputMap;
704
705 openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
706 {
707 return outputMap;
708 }
709
710 const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
711 {
712 return outputMap;
713 }
714 };
715
726 template<typename vector_data_type, typename vector_index_type, typename vector_reduction>
727 struct sparse_vector_special
728 {
730 vector_data_type & vector_data_red;
731
733 vector_data_type & vector_data;
734
736 vector_index_type & segment_offset;
737
739 gpu::ofp_context_t & context;
740
747 inline sparse_vector_special(vector_data_type & vector_data_red,
748 vector_data_type & vector_data,
749 vector_index_type & segment_offset,
750 gpu::ofp_context_t & context)
751 :vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context)
752 {};
753
755 template<typename T>
756 inline void operator()(T& t) const
757 {
758#ifdef __NVCC__
759
760 typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type;
761
762 // reduction type
763 typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
764
765 if (reduction_type::is_special() == true)
766 {
767 auto ite = segment_offset.getGPUIterator();
768
769 CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>),
770 ite,segment_offset.toKernel(),vector_data_red.toKernel(),vector_data.size());
771 }
772
773#else
774 std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
775#endif
776 }
777 };
778
779 template<typename T,
780 typename Ti = long int,
781 typename Memory=HeapMemory,
782 typename layout=typename memory_traits_lin<T>::type,
783 template<typename> class layout_base=memory_traits_lin ,
784 typename grow_p=grow_policy_double,
785 unsigned int impl=vect_isel<T>::value,
786 unsigned int impl2 = VECTOR_SPARSE_STANDARD,
787 typename block_functor = stub_block_functor>
788 class vector_sparse
789 {
790 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
791 vector<T,Memory,layout_base,grow_p,impl> vct_data;
792 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;
793
794 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
795 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
796 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
797 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
798 vector<T,Memory,layout_base,grow_p> vct_add_data;
799 vector<T,Memory,layout_base,grow_p> vct_add_data_reord;
800
801 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0;
802 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1;
803 vector<T,Memory,layout_base,grow_p> vct_add_data_cont;
804 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique;
805 vector<aggregate<int,int>,Memory,layout_base,grow_p> segments_int;
806
807 vector<T,Memory,layout_base,grow_p> vct_add_data_unique;
808
809 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4;
810 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
811 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2;
812 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3;
813 vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp;
814
815 // segments map (This is used only in case of Blocked data)
816 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_segment_index_map;
817
818 block_functor blf;
819
820 T bck;
821
822 CudaMemory mem;
823
824 openfpm::vector<reorder<Ti>> reorder_add_index_cpu;
825
826 size_t max_ele;
827
828 int n_gpu_add_block_slot = 0;
829 int n_gpu_rem_block_slot = 0;
830
837 template<bool prefetch>
838 inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
839 {
840 if (vct_index.size() == 0) {id = 0; return -1;}
841 const Ti *base = &vct_index.template get<0>(0);
842 const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
843 Ti n = vct_data.size()-1;
844 while (n > 1)
845 {
846 Ti half = n / 2;
847 if (prefetch)
848 {
849 __builtin_prefetch(base + half/2, 0, 0);
850 __builtin_prefetch(base + half + half/2, 0, 0);
851 }
852 base = (base[half] < x) ? base+half : base;
853 n -= half;
854 }
855
856 int off = (*base < x);
857 id = base - &vct_index.template get<0>(0) + off;
858 return (base + off != end)?*(base + off):-1;
859 }
860
867 template<bool prefetch>
868 inline void _branchfree_search(Ti x, Ti & id) const
869 {
870 Ti v = _branchfree_search_nobck<prefetch>(x,id);
871 id = (x == v)?id:vct_data.size()-1;
872 }
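/* Stand-alone sketch of the branch-free lower bound used by the two searches above
 * (illustrative; operates on a plain sorted array): the window is halved every
 * iteration by moving the base pointer through a conditional expression the
 * compiler can turn into a conditional move, avoiding hard-to-predict branches.
 *
 * \code
 * template<typename Ti>
 * Ti branchfree_lower_bound(const Ti * vec, Ti n, Ti x)
 * {
 *     const Ti * base = vec;
 *     while (n > 1)
 *     {
 *         Ti half = n / 2;
 *         base = (base[half] < x) ? base + half : base;  // conditional move, no branch penalty
 *         n -= half;
 *     }
 *     return Ti(base - vec) + Ti(*base < x);   // index of the first element >= x
 * }
 * \endcode
 */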
873
874
875 /* \brief take the indexes from the insertion pools and create a continuous array
876 *
877 * \param vct_nadd_index number of insertions of each pool
878 * \param vct_add_index pool of inserts
879 * \param vct_add_cont_index output continuous array of inserted indexes
880 * \param vct_add_data array of added data
881 * \param vct_add_data_cont continuous array of inserted data
882 * \param context gpu context
883 *
884 */
885 size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
886 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index,
887 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
888 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
889 vector<T,Memory,layout_base,grow_p> & vct_add_data,
890 vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
891 gpu::ofp_context_t & context)
892 {
893#ifdef __NVCC__
894
895 // Add 0 to the last element to vct_nadd_index
896 vct_nadd_index.resize(vct_nadd_index.size()+1);
897 vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
898 vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);
899
900 // Merge the list of inserted points for each block
901 vct_index_tmp4.resize(vct_nadd_index.size());
902
903 openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
904 vct_nadd_index.size(),
905 (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() ,
906 context);
907
908 vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
909 size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
910
911 // we reuse vct_nadd_index
912 vct_add_cont_index.resize(n_ele);
913 vct_add_cont_index_map.resize(n_ele);
914
915 if (impl2 == VECTOR_SPARSE_STANDARD)
916 {
917 vct_add_data_cont.resize(n_ele);
918 }
919 else
920 {
921 vct_segment_index_map.resize(n_ele);
922 }
923
924 if (n_gpu_add_block_slot >= 128)
925 {
926 ite_gpu<1> itew;
927 itew.wthr.x = vct_nadd_index.size()-1;
928 itew.wthr.y = 1;
929 itew.wthr.z = 1;
930 itew.thr.x = 128;
931 itew.thr.y = 1;
932 itew.thr.z = 1;
933
934 CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
935 vct_nadd_index.toKernel(),
936 vct_index_tmp4.toKernel(),
937 vct_add_cont_index.toKernel(),
938 vct_add_cont_index_map.toKernel(),
939 n_gpu_add_block_slot);
940 }
941 else
942 {
943 auto itew = vct_add_index.getGPUIterator();
944
945 CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
946 vct_nadd_index.toKernel(),
947 vct_index_tmp4.toKernel(),
948 vct_add_cont_index.toKernel(),
949 vct_add_cont_index_map.toKernel(),
950 n_gpu_add_block_slot);
951 }
952
953 return n_ele;
954#endif
955 return 0;
956 }
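/* Host-side sketch of the compaction idea used by make_continuos (illustrative
 * only): an exclusive scan of the per-block insertion counts yields, for every
 * block, the offset at which its insertions land in the flattened array; the last
 * scan element is the total number of insertions.
 *
 * \code
 * #include <vector>
 * #include <cstddef>
 *
 * inline std::vector<size_t> insert_offsets(const std::vector<size_t> & counts)
 * {
 *     std::vector<size_t> off(counts.size()+1,0);
 *     for (size_t b = 0 ; b < counts.size() ; b++)
 *     {off[b+1] = off[b] + counts[b];}   // exclusive scan; off.back() = total inserts
 *     return off;
 * }
 * \endcode
 */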
957
967 void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
968 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
969 vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
970 vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
971 gpu::ofp_context_t & context)
972 {
973#ifdef __NVCC__
974 ite_gpu<1> itew;
975 itew.wthr.x = vct_nadd_index.size()-1;
976 itew.wthr.y = 1;
977 itew.wthr.z = 1;
978 itew.thr.x = 128;
979 itew.thr.y = 1;
980 itew.thr.z = 1;
981
982 size_t n_ele = vct_add_cont_index.size();
983
984 n_gpu_add_block_slot = 0;
985
986 // now we sort
987 openfpm::sort(
988 (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(),
989 (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(),
990 vct_add_cont_index.size(),
991 gpu::template less_t<Ti>(),
992 context);
993
994 auto ite = vct_add_cont_index.getGPUIterator();
995
996 // Now we reorder the data vector accordingly to the indexes
997
998 if (impl2 == VECTOR_SPARSE_STANDARD)
999 {
1000 vct_add_data_reord.resize(n_ele);
1001 CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel());
1002 }
1003
1004#endif
1005 }
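/* Host-side sketch of the sort-and-gather performed by reorder_indexes
 * (illustrative only): the inserted keys are sorted together with their original
 * positions, and the resulting permutation is used to gather the data array into
 * the same order.
 *
 * \code
 * #include <vector>
 * #include <numeric>
 * #include <algorithm>
 *
 * template<typename K, typename D>
 * void sort_and_gather(std::vector<K> & keys, std::vector<D> & data)
 * {
 *     std::vector<size_t> perm(keys.size());
 *     std::iota(perm.begin(),perm.end(),0);
 *     std::sort(perm.begin(),perm.end(),
 *               [&](size_t a, size_t b){return keys[a] < keys[b];});
 *
 *     std::vector<K> k2(keys.size());
 *     std::vector<D> d2(data.size());
 *     for (size_t i = 0 ; i < perm.size() ; i++)
 *     {k2[i] = keys[perm[i]]; d2[i] = data[perm[i]];}   // gather through the permutation
 *     keys.swap(k2);
 *     data.swap(d2);
 * }
 * \endcode
 */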
1006
1013 template<typename ... v_reduce>
1014 void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort,
1015 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
1016 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index,
1017 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map,
1018 gpu::ofp_context_t & context)
1019 {
1020#ifdef __NVCC__
1021
1022 typedef boost::mpl::vector<v_reduce...> vv_reduce;
1023
1024 auto ite = vct_add_index_sort.getGPUIterator();
1025
1026 mem.allocate(sizeof(int));
1027 mem.fill(0);
1028 vct_add_index_unique.resize(vct_add_index_sort.size()+1);
1029
1030 ite = vct_add_index_sort.getGPUIterator();
1031
1032 vct_index_tmp4.resize(vct_add_index_sort.size()+1);
1033
1034 CUDA_LAUNCH(
1035 (
1036 find_buffer_offsets_for_scan
1037 <0,
1038 decltype(vct_add_index_sort.toKernel()),
1039 decltype(vct_index_tmp4.toKernel())
1040 >
1041 ),
1042 ite,
1043 vct_add_index_sort.toKernel(),
1044 vct_index_tmp4.toKernel());
1045
1046 openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context);
1047
1048 vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1049 int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1050
1051 vct_add_index_unique.resize(n_ele_unique);
1052
1053 if (impl2 == VECTOR_SPARSE_STANDARD)
1054 {
1055 vct_add_data_unique.resize(n_ele_unique);
1056 }
1057
1058 CUDA_LAUNCH(
1059 (construct_index_unique<0>),
1060 ite,
1061 vct_add_index_sort.toKernel(),
1062 vct_index_tmp4.toKernel(),
1063 vct_add_index_unique.toKernel());
1064
1065 typedef boost::mpl::vector<v_reduce...> vv_reduce;
1066
1067 // Then we merge the two list vct_index and vct_add_index_unique
1068
1069 // index to get merge index
1070 vct_m_index.resize(vct_index.size());
1071
1072 if (vct_m_index.size() != 0)
1073 {
1074 ite = vct_m_index.getGPUIterator();
1075 CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1076 }
1077
1078 // after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge
1079 // we reorder the data array also
1080
1081 vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size());
1082 vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size());
1083 vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size());
1084
1085 // Do not delete this reserve
1086 // Unfortunately all resize with DataBlocks are broken
1087 if (impl2 == VECTOR_SPARSE_STANDARD)
1088 {
1089 vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
1090 vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
1091 }
1092
1093 ite = vct_add_index_unique.getGPUIterator();
1094 vct_index_tmp4.resize(vct_add_index_unique.size());
1095 CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size());
1096
1097 ite_gpu<1> itew;
1098
1099 itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0);
1100 itew.wthr.y = 1;
1101 itew.wthr.z = 1;
1102 itew.thr.x = 128;
1103 itew.thr.y = 1;
1104 itew.thr.z = 1;
1105
1106 vct_index_dtmp.resize(itew.wthr.x);
1107
1108 // we merge vct_index with vct_add_index_unique into vct_merge_index, vct_merge_index contains the merged indexes
1109 //
1110
1111 openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1112 (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
1113 (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);
1114
1115
1116#endif
1117 }
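/* Host-side sketch of the duplicate collapse done at the start of merge_indexes
 * (illustrative only): in the sorted insert list the first element of every run of
 * equal keys is kept together with the offset where its run (segment) starts; on
 * the device this is done with a flag kernel plus a scan instead of the sequential
 * loop below.
 *
 * \code
 * #include <vector>
 * #include <utility>
 *
 * template<typename K>
 * std::vector<std::pair<K,int>> unique_with_offsets(const std::vector<K> & sorted_keys)
 * {
 *     std::vector<std::pair<K,int>> out;   // (key, start index of its segment)
 *     for (int i = 0 ; i < (int)sorted_keys.size() ; i++)
 *     {
 *         if (i == 0 || sorted_keys[i] != sorted_keys[i-1])
 *         {out.push_back({sorted_keys[i],i});}
 *     }
 *     return out;
 * }
 * \endcode
 */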
1118
1119
1120
1121 template<typename ... v_reduce>
1122 void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1123 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
1124 vector<T,Memory,layout_base,grow_p> & vct_add_data,
1125 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
1126 gpu::ofp_context_t & context)
1127 {
1128#ifdef __NVCC__
1129 ite_gpu<1> itew;
1130 itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1131 itew.wthr.y = 1;
1132 itew.wthr.z = 1;
1133 itew.thr.x = 128;
1134 itew.thr.y = 1;
1135 itew.thr.z = 1;
1136
1137 typedef boost::mpl::vector<v_reduce...> vv_reduce;
1138
1140
1141 // Now we can do a segmented reduction
1142 scalar_block_implementation_switch<impl2, block_functor>
1143 ::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());
1144
1145 if (impl2 == VECTOR_SPARSE_STANDARD)
1146 {
1147 sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
1148 decltype(vct_add_data_reord_map),
1149 decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
1150 svr(
1151 vct_add_data_unique,
1152 vct_add_data_reord,
1153 vct_add_data,
1154 vct_add_data_reord_map,
1155 vct_add_index_unique,
1156 blf,
1157 context);
1158
1159 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1160 vct_add_index_unique.remove(vct_add_index_unique.size()-1);
1161 }
1162
1163 sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
1164 decltype(vct_add_index_unique),
1165 vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context);
1166 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);
1167
1169
1170 // Now perform the right solve_conflicts according to impl2
1171 scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
1172 decltype(vct_data),
1173 decltype(vct_index),
1174 decltype(segments_new),
1175 decltype(vct_index_dtmp),
1176 Ti,
1177 v_reduce ...
1178 >
1179 (
1180 vct_index,
1181 vct_index_tmp,
1182 vct_index_tmp2,
1183 vct_index_tmp3,
1184 vct_index_dtmp,
1185 vct_add_data_reord_map,
1186 segments_new,
1187 vct_data,
1188 vct_add_data,
1189 vct_add_data_unique,
1190 vct_add_data_cont,
1191 itew,
1192 blf,
1193 context
1194 );
1195
1196
1197#else
1198 std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1199#endif
1200 }
1201
1202 template<typename ... v_reduce>
1203 void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1204 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1205 vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1206 gpu::ofp_context_t & context)
1207 {
1208#ifdef __NVCC__
1209
1210 // To avoid the case where you never called setGPUInsertBuffer
1211 if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
1212 {
1213 return;
1214 }
1215
1216 size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1,
1217 vct_add_data,vct_add_data_cont,context);
1218
1219 // At this point we can check whether we have not inserted anything actually,
1220 // in this case, return without further ado...
1221 if (vct_add_index_cont_0.size() == 0)
1222 {return;}
1223
1224 reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context);
1225
1226 merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
1227 vct_index_tmp,vct_index_tmp2,
1228 context);
1229
1230 merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context);
1231
1232#else
1233 std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1234#endif
1235 }
1236
1237
1238 void flush_on_gpu_remove(
1239 gpu::ofp_context_t & context)
1240 {
1241#ifdef __NVCC__
1242
1243 // Add 0 to the last element to vct_nadd_index
1244 vct_nrem_index.resize(vct_nrem_index.size()+1);
1245 vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
1246 vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);
1247
1248 // Merge the list of inserted points for each block
1249 vct_index_tmp4.resize(vct_nrem_index.size());
1250
1251 openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , context);
1252
1253 vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1254 size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1255
1256 // we reuse vct_nadd_index
1257 vct_add_index_cont_0.resize(n_ele);
1258 vct_add_index_cont_1.resize(n_ele);
1259
1260 ite_gpu<1> itew;
1261 itew.wthr.x = vct_nrem_index.size()-1;
1262 itew.wthr.y = 1;
1263 itew.wthr.z = 1;
1264 itew.thr.x = 128;
1265 itew.thr.y = 1;
1266 itew.thr.z = 1;
1267
1268 CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
1269 vct_nrem_index.toKernel(),
1270 vct_index_tmp4.toKernel(),
1271 vct_add_index_cont_0.toKernel(),
1272 vct_add_index_cont_1.toKernel(),
1273 n_gpu_rem_block_slot);
1274
1275 // now we sort
1276 openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
1277 vct_add_index_cont_0.size(), gpu::template less_t<Ti>(), context);
1278
1279 auto ite = vct_add_index_cont_0.getGPUIterator();
1280
1281 mem.allocate(sizeof(int));
1282 mem.fill(0);
1283 vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);
1284
1285 ite = vct_add_index_cont_0.getGPUIterator();
1286
1287 // produce unique index list
1288 // Find the buffer bases
1289 CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>),
1290 ite,
1291 vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());
1292
1293 mem.deviceToHost();
1294 int n_ele_unique = *(int *)mem.getPointer();
1295
1296 vct_add_index_unique.resize(n_ele_unique);
1297
1298 openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
1299 vct_add_index_unique.size(),gpu::template less_t<Ti>(),context);
1300
1301 // Then we merge the two list vct_index and vct_add_index_unique
1302
1303 // index to get merge index
1304 vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());
1305
1306 ite = vct_m_index.getGPUIterator();
1307 CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1308
1309 ite = vct_add_index_unique.getGPUIterator();
1310 CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size());
1311
1312 // after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge
1313 // we reorder the data array also
1314
1315 vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1316 vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());
1317
1318 itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1319 itew.wthr.y = 1;
1320 itew.wthr.z = 1;
1321 itew.thr.x = 128;
1322 itew.thr.y = 1;
1323 itew.thr.z = 1;
1324
1325 vct_index_dtmp.resize(itew.wthr.x);
1326
1327 // we merge vct_index with vct_add_index_unique into vct_index_tmp, vct_index_tmp2 contains the merging index
1328 //
1329 openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1330 (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
1331 (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);
1332
1333 vct_index_tmp3.resize(128*itew.wthr.x);
1334
1335 CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>),
1336 itew,
1337 vct_index_tmp.toKernel(),
1338 vct_index_tmp2.toKernel(),
1339 vct_index_tmp3.toKernel(),
1340 vct_m_index.toKernel(),
1341 vct_index_dtmp.toKernel(),
1342 vct_index.size());
1343
1344 // we scan tmp3
1345 openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context);
1346
1347 // get the size to resize vct_index and vct_data
1348 vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
1349 int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
1350
1351 vct_add_data_cont.resize(size);
1352 vct_index.resize(size);
1353
1354 CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
1355 vct_index.toKernel(),vct_add_data_cont.toKernel(),
1356 vct_index_dtmp.toKernel());
1357
1358 vct_data.swap(vct_add_data_cont);
1359
1360#else
1361 std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1362#endif
1363 }
1364
1365 void resetBck()
1366 {
1367 // re-add background
1368 vct_data.resize(vct_data.size()+1);
1369 vct_data.get(vct_data.size()-1) = bck;
1370
1371 htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1372 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1373 }
1374
1375 template<typename ... v_reduce>
1376 void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1377 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1378 vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1379 gpu::ofp_context_t & context)
1380 {
1381 flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
1382 }
1383
1384 template<typename ... v_reduce>
1385 void flush_on_cpu()
1386 {
1387 if (vct_add_index.size() == 0)
1388 {return;}
1389
1390 // First copy the added index to reorder
1391 reorder_add_index_cpu.resize(vct_add_index.size());
1392 vct_add_data_cont.resize(vct_add_index.size());
1393
1394 for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1395 {
1396 reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
1397 reorder_add_index_cpu.get(i).id2 = i;
1398 }
1399
1400 reorder_add_index_cpu.sort();
1401
1402 // Copy the data
1403 for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1404 {
1405 vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
1406 }
1407
1408 typedef boost::mpl::vector<v_reduce...> vv_reduce;
1409
1410 sparse_vector_reduction_cpu<decltype(vct_add_data),
1411 decltype(vct_add_index_unique),
1412 decltype(reorder_add_index_cpu),
1413 vv_reduce,
1414 impl2>
1415 svr(vct_add_data_unique,
1416 vct_add_data_cont,
1417 vct_add_index_unique,
1418 reorder_add_index_cpu);
1419
1420 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1421
1422 // merge the data
1423
1424 vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
1425 vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
1426
1427 vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
1428 vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1429
1430 Ti di = 0;
1431 Ti ai = 0;
1432 size_t i = 0;
1433
1434 for ( ; i < vct_data_tmp.size() ; i++)
1435 {
1436 Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
1437 Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();
1438
1439 if ( id_a <= id_d )
1440 {
1441 vct_index_tmp.template get<0>(i) = id_a;
1442
1443 if (id_a == id_d)
1444 {
1445 auto dst = vct_data_tmp.get(i);
1446 auto src = vct_add_data_unique.get(ai);
1447
1448 sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
1449 decltype(vct_add_data.get(ai)),
1450 vv_reduce>
1451 sva(src,dst);
1452
1453 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
1454 ai++;
1455
1456 dst = vct_data_tmp.get(i);
1457 src = vct_data.get(di);
1458
1459 sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
1460 decltype(vct_data.get(di)),
1461 vv_reduce,
1462 impl2>
1463 svr(src,dst);
1464 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1465
1466 di++;
1467
1468 vct_data_tmp.resize(vct_data_tmp.size()-1);
1469 vct_index_tmp.resize(vct_index_tmp.size()-1);
1470 }
1471 else
1472 {
1473 vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
1474 vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
1475 ai++;
1476 }
1477 }
1478 else
1479 {
1480 vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
1481 vct_data_tmp.get(i) = vct_data.get(di);
1482 di++;
1483 }
1484 }
1485
1486 vct_index.swap(vct_index_tmp);
1487 vct_data.swap(vct_data_tmp);
1488
1489 vct_add_data.clear();
1490 vct_add_index.clear();
1491 vct_add_index_unique.clear();
1492 vct_add_data_unique.clear();
1493 }
1494
1495 public:
1496
1497 vector_sparse()
1498 :max_ele(0)
1499 {
1500 vct_data.resize(1);
1501 }
1502
1507 auto getIndexBuffer() -> decltype(vct_index)&
1508 {
1509 return vct_index;
1510 }
1511
1516 auto getDataBuffer() -> decltype(vct_data)&
1517 {
1518 return vct_data;
1519 }
1520
1525 auto getIndexBuffer() const -> const decltype(vct_index)&
1526 {
1527 return vct_index;
1528 }
1529
1534 auto getDataBuffer() const -> const decltype(vct_data)&
1535 {
1536 return vct_data;
1537 }
1538
1550 openfpm::sparse_index<Ti> get_sparse(Ti id) const
1551 {
1552 Ti di;
1553 this->_branchfree_search<false>(id,di);
1554 openfpm::sparse_index<Ti> sid;
1555 sid.id = di;
1556
1557 return sid;
1558 }
1559
1570 template <unsigned int p>
1571 inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
1572 {
1573 Ti di;
1574 this->_branchfree_search<false>(id,di);
1575 return vct_data.template get<p>(di);
1576 }
1577
1588 inline auto get(Ti id) const -> decltype(vct_data.get(id))
1589 {
1590 Ti di;
1591 this->_branchfree_search<false>(id,di);
1592 return vct_data.get(di);
1593 }
1594
1600 void resize(size_t n)
1601 {
1602 max_ele = n;
1603 }
1604
1613 void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv)
1614 {
1615 vct_index.swap(iv);
1616 }
1617
1623 template <unsigned int p>
1624 auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
1625 {
1626 return vct_data.template get<p>(vct_data.size()-1);
1627 }
1628
1634 auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
1635 {
1636 return vct_data.get(vct_data.size()-1);
1637 }
1638
1639 template<unsigned int p>
1640 void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
1641 {
1642 meta_copy_d<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type,
1643 typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
1644 ::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));
1645
1646 vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);
1647
1648 meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>
1649 ::meta_copy_(bck_,bck.template get<p>());
1650 }
1651
1659 template <unsigned int p>
1660 auto insert(Ti ele) -> decltype(vct_data.template get<p>(0))
1661 {
1662 vct_add_index.add();
1663 vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1664 vct_add_data.add();
1665 return vct_add_data.template get<p>(vct_add_data.size()-1);
1666 }
1667
1675 template <unsigned int p>
1676 auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0))
1677 {
1678 is_new = false;
1679 Ti di;
1680
1681 // first we have to search if the block exist
1682 Ti v = _branchfree_search_nobck<true>(ele,di);
1683
1684 if (v == ele)
1685 {
1686 // block exist
1687 return vct_data.template get<p>(di);
1688 }
1689 is_new = true;
1690
1691 // It does not exist, we create it; di contains the index where we have to create the new block
1692 vct_index.insert(di);
1693 vct_data.insert(di);
1694 vct_index.template get<0>(di) = ele;
1695 return vct_data.template get<p>(di);
1696 }
1697
1703 auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.get(0))
1704 {
1705 is_new = false;
1706 Ti di;
1707
1708 // first we have to search if the block exist
1709 Ti v = _branchfree_search_nobck<true>(ele,di);
1710
1711 if (v == ele)
1712 {
1713 // block exist
1714 return vct_data.get(di);
1715 }
1716
1717 // It does not exist, we create it; di contains the index where we have to create the new block
1718 vct_index.insert(di);
1719 vct_data.insert(di);
1720 is_new = true;
1721
1722 vct_index.template get<0>(di) = ele;
1723
1724 return vct_data.get(di);
1725 }
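/* Usage sketch (illustrative; vs is an assumed vector_sparse<aggregate<float>>
 * instance): insertFlush bypasses the staging buffers and works directly on the
 * sorted index/data arrays, so no flush() call is required afterwards.
 *
 * \code
 * bool is_new;
 * vs.template insertFlush<0>(42,is_new) = 3.0f;   // element created, is_new == true
 * vs.template insertFlush<0>(42,is_new) += 1.0f;  // element found,   is_new == false
 * \endcode
 */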
1726
1732 auto insert(Ti ele) -> decltype(vct_data.get(0))
1733 {
1734 vct_add_index.add();
1735 vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1736 vct_add_data.add();
1737 return vct_add_data.get(vct_add_data.size()-1);
1738 }
1739
1747 template<typename ... v_reduce>
1748 void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1749 gpu::ofp_context_t & context,
1750 flush_type opt = FLUSH_ON_HOST,
1751 int i = 0)
1752 {
1753 // Eliminate background
1754 vct_data.resize(vct_index.size());
1755
1756 if (opt & flush_type::FLUSH_ON_DEVICE)
1757 {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context,i);}
1758 else
1759 {this->flush_on_cpu<v_reduce ... >();}
1760
1761 resetBck();
1762 }
1763
1771 template<typename ... v_reduce>
1772 void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1773 gpu::ofp_context_t & context,
1774 flush_type opt = FLUSH_ON_HOST)
1775 {
1776 // Eliminate background
1777 vct_data.resize(vct_index.size());
1778
1779 if (opt & flush_type::FLUSH_ON_DEVICE)
1780 {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
1781 else
1782 {this->flush_on_cpu<v_reduce ... >();}
1783
1784 resetBck();
1785 }
1786
1792 template<typename ... v_reduce>
1793 void flush(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
1794 {
1795 // Eliminate background
1796 vct_data.resize(vct_index.size());
1797
1798 if (opt & flush_type::FLUSH_ON_DEVICE)
1799 {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
1800 else
1801 {this->flush_on_cpu<v_reduce ... >();}
1802
1803 resetBck();
1804 }
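/* Behavioural sketch (illustrative; vs, sadd_<0> and ctx are assumed names): with
 * an additive reduction, flushing an insertion on an index that already exists
 * combines the values rather than duplicating the entry.
 *
 * \code
 * vs.template insert<0>(7) = 1.0f;
 * vs.template flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_HOST);  // one entry, value 1.0
 * vs.template insert<0>(7) = 2.0f;
 * vs.template flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_HOST);  // still one entry, value 3.0
 * \endcode
 */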
1805
1811 void flush_remove(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
1812 {
1813 vct_data.resize(vct_data.size()-1);
1814
1815 if (opt & flush_type::FLUSH_ON_DEVICE)
1816 {this->flush_on_gpu_remove(context);}
1817 else
1818 {
1819 std::cerr << __FILE__ << ":" << __LINE__ << " error, flush_remove on CPU has not been implemented yet";
1820 }
1821
1822 resetBck();
1823 }
1824
1829 size_t size()
1830 {
1831 return vct_index.size();
1832 }
1833
1838 vector<aggregate<Ti>,Memory,layout_base,grow_p> &
1839 private_get_vct_index()
1840 {
1841 return vct_index;
1842 }
1843
1849 template<unsigned int ... prp>
1850 void deviceToHost()
1851 {
1852 vct_index.template deviceToHost<0>();
1853 vct_data.template deviceToHost<prp...>();
1854 }
1855
1861 template<unsigned int ... prp>
1862 void hostToDevice()
1863 {
1864 vct_index.template hostToDevice<0>();
1865 vct_data.template hostToDevice<prp...>();
1866 }
1867
1873 vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
1874 {
1875 vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
1876 vct_add_index.toKernel(),
1877 vct_rem_index.toKernel(),vct_add_data.toKernel(),
1878 vct_nadd_index.toKernel(),
1879 vct_nrem_index.toKernel(),
1880 n_gpu_add_block_slot,
1881 n_gpu_rem_block_slot);
1882
1883 return mvsck;
1884 }
1885
1892 void setGPUInsertBuffer(int nblock, int nslot)
1893 {
1894 vct_add_index.resize(nblock*nslot);
1895 vct_nadd_index.resize(nblock);
1896 vct_add_data.resize(nblock*nslot);
1897 n_gpu_add_block_slot = nslot;
1898 vct_nadd_index.template fill<0>(0);
1899 }
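/* GPU insertion pattern (sketch; my_insert_kernel and the launch sizes are assumed
 * names, not part of this header): every block receives nslot staging slots, a user
 * kernel fills them through the kernel-side view returned by toKernel(), and the
 * staged insertions are merged with a device-side flush.
 *
 * \code
 * vs.setGPUInsertBuffer(n_blocks,n_slots_per_block);
 * // CUDA_LAUNCH(my_insert_kernel,ite,vs.toKernel(), ...);   // kernel-side insert() calls
 * vs.template flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE);
 * \endcode
 */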
1900
1906 void preFlush()
1907 {
1908#ifdef __NVCC__
1909 vct_nadd_index.resize(vct_add_index.size());
1910
1911 if (vct_nadd_index.size() != 0)
1912 {
1913 auto ite = vct_nadd_index.getGPUIterator();
1914 CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
1915 }
1916 n_gpu_add_block_slot = 1;
1917#endif
1918 }
1919
1924 auto getGPUInsertBuffer() -> decltype(vct_add_data)&
1925 {
1926 return vct_add_data;
1927 }
1928
1935 void setGPURemoveBuffer(int nblock, int nslot)
1936 {
1937 vct_rem_index.resize(nblock*nslot);
1938 vct_nrem_index.resize(nblock);
1939 n_gpu_rem_block_slot = nslot;
1940 vct_nrem_index.template fill<0>(0);
1941 }
1942
1943#ifdef CUDA_GPU
1944
1950 auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
1951 {
1952 return vct_index.getGPUIterator();
1953 }
1954
1955#endif
1956
1961 void clear()
1962 {
1963 vct_data.clear();
1964 vct_index.clear();
1965 vct_add_index.clear();
1966 vct_add_data.clear();
1967
1968 // re-add background
1969 vct_data.resize(vct_data.size()+1);
1970 vct_data.get(vct_data.size()-1) = bck;
1971
1972 htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1973 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1974
1975 max_ele = 0;
1976 n_gpu_add_block_slot = 0;
1977 n_gpu_rem_block_slot = 0;
1978 }
1979
1981 {
1982 vct_data.swap(sp.vct_data);
1983 vct_index.swap(sp.vct_index);
1984 vct_add_index.swap(sp.vct_add_index);
1985 vct_add_data.swap(sp.vct_add_data);
1986
1987 size_t max_ele_ = sp.max_ele;
1988 sp.max_ele = max_ele;
1989 this->max_ele = max_ele_;
1990 }
1991
1992 vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
1993 {
1994 return vct_add_data;
1995 }
1996
1997 vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
1998 {
1999 return vct_add_index;
2000 }
2001
2002 const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
2003 {
2004 return vct_add_index;
2005 }
2006
2007 vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
2008 {
2009 return vct_nadd_index;
2010 }
2011
2012 const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
2013 {
2014 return vct_nadd_index;
2015 }
2016
2017 auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
2018 {
2019 return blf.get_outputMap();
2020 }
2021
2022 auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
2023 {
2024 return blf.get_outputMap();
2025 }
2026
2031 void removeUnusedBuffers()
2032 {
2033 vct_add_data.resize(0);
2034 vct_add_data.shrink_to_fit();
2035
2036 vct_add_data.resize(0);
2037 vct_add_data.shrink_to_fit();
2038
2039 vct_add_data_reord.resize(0);
2040 vct_add_data_reord.shrink_to_fit();
2041
2042 vct_add_data_cont.resize(0);
2043 vct_add_data_cont.shrink_to_fit();
2044
2045 vct_add_data_unique.resize(0);
2046 vct_add_data_unique.shrink_to_fit();
2047 }
2048
2049 /* \brief Return the offsets of the segments for the merge indexes
2050 *
2051 *
2052 */
2053 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
2054 {
2055 return vct_add_index_unique;
2056 }
2057
2058 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
2059 {
2060 return vct_add_index_unique;
2061 }
2062
2081 vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
2082 {
2083 return vct_add_index_cont_1;
2084 }
2085
2104 vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
2105 {
2106 return vct_index_tmp2;
2107 }
2108 };
2109
2110
2111 template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int>
2112 using vector_sparse_gpu = openfpm::vector_sparse<
2113 T,
2114 indexT,
2115 CudaMemory,
2116 typename memory_traits_inte<T>::type,
2117 memory_traits_inte,
2118 grow_policy_double,
2119 vect_isel<T>::value,
2120 blockSwitch,
2121 block_functor
2122 >;
2123
2124 template<typename T, typename block_functor = stub_block_functor, typename indexT = long int>
2125 using vector_sparse_gpu_block = openfpm::vector_sparse<
2126 T,
2127 indexT,
2128 CudaMemory,
2129 typename memory_traits_inte<T>::type,
2130 memory_traits_inte,
2131 grow_policy_double,
2132 vect_isel<T>::value,
2133 VECTOR_SPARSE_BLOCK,
2134 block_functor
2135 >;
2136}
2137
2138
2139
2140#endif /* MAP_VECTOR_SPARSE_HPP_ */