OpenFPM 5.2.0
Project that contains the implementation of distributed structures
map_vector_sparse.hpp
1 /*
2  * map_vector_sparse.hpp
3  *
4  * Created on: Jan 22, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef MAP_VECTOR_SPARSE_HPP_
9 #define MAP_VECTOR_SPARSE_HPP_
10 
11 #include "Vector/map_vector.hpp"
12 #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
13 #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
14 #include "util/ofp_context.hpp"
15 #include <iostream>
16 #include <limits>
17 
18 #if defined(__NVCC__)
19  #include "util/cuda/kernels.cuh"
20 #endif
21 
22 #include "util/cuda/scan_ofp.cuh"
23 #include "util/cuda/sort_ofp.cuh"
24 #include "util/cuda/segreduce_ofp.cuh"
25 #include "util/cuda/merge_ofp.cuh"
26 
27 enum flush_type
28 {
29  FLUSH_ON_HOST = 0,
30  FLUSH_ON_DEVICE = 1,
31  FLUSH_NO_DATA = 2
32 };
33 
34 template<typename OfpmVectorT>
35 using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;
36 
37 namespace openfpm
38 {
39  // All props
40  template<typename sg_type>
41  struct htoD
42  {
43  //! sparse vector to synchronize
44  sg_type & sg;
45 
46  unsigned int lele;
47 
48  htoD(sg_type & sg, unsigned int lele)
49  :sg(sg),lele(lele)
50  {};
51 
52 
54  template<typename T>
55  __device__ __host__ inline void operator()(T& t) const
56  {
57  sg.template hostToDevice<T::value>(lele,lele);
58  }
59  };
60 
61  constexpr int VECTOR_SPARSE_STANDARD = 1;
62  constexpr int VECTOR_SPARSE_BLOCK = 2;
63 
64  template<typename reduction_type, unsigned int impl>
65  struct cpu_block_process
66  {
67  template<typename encap_src, typename encap_dst>
68  static inline void process(encap_src & src, encap_dst & dst)
69  {
70  dst = reduction_type::red(dst,src);
71  }
72  };
73 
74  template<typename reduction_type>
75  struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK>
76  {
77  template<typename encap_src, typename encap_dst>
78  static inline void process(encap_src & src, encap_dst & dst)
79  {
80  for (size_t i = 0 ; i < encap_src::size ; i++)
81  {
82  dst[i] = reduction_type::red(dst[i],src[i]);
83  }
84  }
85  };
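/* A reduction functor plugged into cpu_block_process is assumed to expose a
 * static red(a,b) returning the combined value; a minimal sum functor in that
 * style (editor's sketch, the name sum_red is hypothetical):
 *
 * \code
 * struct sum_red
 * {
 *     template<typename T1, typename T2>
 *     static T1 red(T1 & a, T2 & b) { return a + b; }
 * };
 * \endcode
 *
 * With it, the scalar case above performs dst = sum_red::red(dst,src) once per
 * property, while the VECTOR_SPARSE_BLOCK specialization applies the same
 * operation to every element of the block.
 */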
86 
87  template<typename reduction_type>
88  struct cpu_block_process<reduction_type,3>
89  {
90  template<typename encap_src, typename encap_dst,unsigned int N1>
91  static inline void process(encap_src & src, encap_dst (& dst)[N1])
92  {
93  for (unsigned int j = 0 ; j < N1 ; j++)
94  {
95  for (size_t i = 0 ; i < encap_dst::size ; i++)
96  {
97  dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
98  }
99  }
100  }
101 
102  template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst>
103  static inline void process_e(encap_src & src, encap_dst & dst)
104  {
105  for (unsigned int j = 0 ; j < N1 ; j++)
106  {
107  for (size_t i = 0 ; i < blockSize ; i++)
108  {
109  dst[i][j] = reduction_type::red(dst[i][j],src[i][j]);
110  }
111  }
112  }
113  };
114 
119  template<unsigned int impl, typename block_functor>
120  struct scalar_block_implementation_switch // Case for scalar implementations
121  {
122  template <unsigned int p, typename vector_index_type>
123  static void extendSegments(vector_index_type & segments, size_t dataSize)
124  {
125 #ifdef __NVCC__
126  // Append trailing element to segment (marks end of last segment)
127  segments.resize(segments.size()+1);
128  segments.template get<p>(segments.size() - 1) = dataSize;
129  segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
130 #else // __NVCC__
131  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
132 #endif // __NVCC__
133  }
134 
135  template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2>
136  static void segreduce(vector_data_type & vector_data,
137  vector_data_type & vector_data_unsorted,
138  vector_index_type & vector_data_map,
139  vector_index_type2 & segment_offset,
140  vector_data_type & vector_data_red,
141  block_functor & blf,
142  gpu::ofp_context_t& gpuContext)
143  {
144 #ifdef __NVCC__
145  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
146  typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
147  typedef typename reduction_type::template op_red<red_type> red_op;
148  typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
149  typename reduction_type::template op_initial_value<red_type> initial_value_functor;
150 
151  assert((std::is_same<seg_type,int>::value == true));
152 
153  openfpm::segreduce(
154  (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
155  (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size()-1,
156  (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
157  red_op(), initial_value_functor(), gpuContext);
158 #else // __NVCC__
159  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
160 #endif // __NVCC__
161  }
162 
163 
179  template <
180  typename vector_data_type,
181  typename vector_index_type,
182  typename vector_index_type2,
183  typename vector_index_dtmp_type,
184  typename Ti,
185  typename ... v_reduce>
186  static void solveConflicts(
187  vector_index_type & vct_index,
188  vector_index_type & vct_index_tmp,
189  vector_index_type & vct_index_tmp2,
190  vector_index_type & vct_index_tmp3,
191  vector_index_dtmp_type & vct_index_dtmp,
192  vector_index_type & vct_add_index_cont_1,
193  vector_index_type2 & vct_add_index_unique,
194  vector_data_type & vct_data,
195  vector_data_type & vct_add_data,
196  vector_data_type & vct_add_data_unique,
197  vector_data_type & vct_add_data_cont,
198  ite_gpu<1> & itew,
199  block_functor & blf,
200  gpu::ofp_context_t& gpuContext
201  )
202  {
203 #ifdef __NVCC__
204 
205  CUDA_LAUNCH((solve_conflicts<
206  decltype(vct_index_tmp.toKernel()),
207  decltype(vct_data.toKernel()),
208  decltype(vct_index_dtmp.toKernel()),
209  128,
210  v_reduce ...
211  >),
212  itew,
213  vct_index_tmp.toKernel(),vct_data.toKernel(),
214  vct_index_tmp2.toKernel(),vct_add_data_unique.toKernel(),
215  vct_index_tmp3.toKernel(),vct_add_data_cont.toKernel(),
216  vct_index_dtmp.toKernel(),
217  (int)vct_index.size());
218 
219  // we scan tmp3
220  openfpm::scan(
221  (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
222  vct_index_dtmp.size(),
223  (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
224  gpuContext);
225 
226  // get the size to resize vct_index and vct_data
227  vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
228  int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
229 
230  vct_index.resize(size);
231  vct_data.resize(size);
232 
233  CUDA_LAUNCH(realign,itew,vct_index_tmp3.toKernel(),vct_add_data_cont.toKernel(),
234  vct_index.toKernel(), vct_data.toKernel(),
235  vct_index_dtmp.toKernel());
236 
237 
238 #else // __NVCC__
239  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
240 #endif // __NVCC__
241  }
242  };
243 
244 
245  template<typename block_functor>
246  struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
247  {
248  template <unsigned int p, typename vector_index_type>
249  static void extendSegments(vector_index_type & segments, size_t dataSize)
250  {
251 #ifdef __NVCC__
252  // Append trailing element to segment (marks end of last segment)
253  segments.resize(segments.size()+1);
254  segments.template get<p>(segments.size() - 1) = dataSize;
255  segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
256 #else // __NVCC__
257  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
258 #endif // __NVCC__
259  }
260 
261  template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2>
262  static void segreduce(vector_data_type & vector_data,
263  vector_data_type & vector_data_unsorted,
264  vector_index_type & vector_data_map,
265  vector_index_type2 & segment_offset,
266  vector_data_type & vector_data_red,
267  block_functor & blf,
268  gpu::ofp_context_t& gpuContext)
269  {
270 
271  }
272 
273  template <
274  typename vector_data_type,
275  typename vector_index_type,
276  typename vector_index_type2,
277  typename vector_index_dtmp_type,
278  typename Ti,
279  typename ... v_reduce>
280  static void solveConflicts(
281  vector_index_type & vct_index,
282  vector_index_type & vct_index_tmp,
283  vector_index_type & vct_index_tmp2,
284  vector_index_type & vct_index_tmp3,
285  vector_index_dtmp_type & vct_index_dtmp,
286  vector_index_type & vct_add_index_cont_1,
287  vector_index_type2 & vct_add_index_unique,
288  vector_data_type & vct_data,
289  vector_data_type & vct_add_data,
290  vector_data_type & vct_add_data_unique,
291  vector_data_type & vct_add_data_cont,
292  ite_gpu<1> & itew,
293  block_functor & blf,
294  gpu::ofp_context_t& gpuContext
295  )
296  {
297 #ifdef __NVCC__
298  blf.template solve_conflicts<1,
299  decltype(vct_index_tmp),
300  decltype(vct_add_index_unique),
301  decltype(vct_data),
302  v_reduce ...>
303  (vct_index_tmp, vct_index_tmp2, vct_add_index_unique, vct_add_index_cont_1,
304  vct_data, vct_add_data,
305  vct_index, vct_add_data_cont,
306  gpuContext);
307  vct_add_data_cont.swap(vct_data);
308 
309 #else // __NVCC__
310  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
311 #endif // __NVCC__
312  }
313  };
314 
315  template<typename Ti>
316  struct reorder
317  {
318  Ti id;
319  Ti id2;
320 
321  bool operator<(const reorder & t) const
322  {
323  return id < t.id;
324  }
325  };
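/* The reorder pairs are ordered by id only, so after sorting id2 still records
 * where each insertion lives in the unsorted insert buffer (editor's worked
 * example):
 *
 * \code
 * // inserts arrive for keys 7, 3, 7  ->  pairs (7,0) (3,1) (7,2)
 * // after sort():                        (3,1) (7,0) (7,2)
 * // equal keys are now adjacent for the sequential reduction pass, and id2
 * // selects the insert-buffer slot the corresponding data is copied from.
 * \endcode
 */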
326 
327  template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type>
328  struct sparse_vector_reduction_cpu_impl
329  {
330  template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
331  static inline void red(size_t & i, vector_data_type & vector_data_red,
332  vector_data_type & vector_data,
333  vector_index_type & vector_index,
334  vector_index_type_reo & reorder_add_index_cpu)
335  {
336  size_t start = reorder_add_index_cpu.get(i).id;
337  red_type red = vector_data.template get<reduction_type::prop::value>(i);
338 
339  size_t j = 1;
340  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
341  {
342  cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
343  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
344  }
345  vector_data_red.add();
346  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
347 
348  if (T::value == 0)
349  {
350  vector_index.add();
351  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
352  }
353 
354  i += j;
355  }
356  };
357 
358 
359  template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1>
360  struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]>
361  {
362  template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
363  static inline void red(size_t & i, vector_data_type & vector_data_red,
364  vector_data_type & vector_data,
365  vector_index_type & vector_index,
366  vector_index_type_reo & reorder_add_index_cpu)
367  {
368  size_t start = reorder_add_index_cpu.get(i).id;
369  red_type red[N1];
370 
371  for (size_t k = 0 ; k < N1 ; k++)
372  {
373  red[k] = vector_data.template get<reduction_type::prop::value>(i)[k];
374  }
375 
376  size_t j = 1;
377  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
378  {
379  auto ev = vector_data.template get<reduction_type::prop::value>(i+j);
380  cpu_block_process<reduction_type,impl+1>::process(ev,red);
381  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
382  }
383 
384  vector_data_red.add();
385 
386  for (size_t k = 0 ; k < N1 ; k++)
387  {
388  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k];
389  }
390 
391  if (T::value == 0)
392  {
393  vector_index.add();
394  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
395  }
396 
397  i += j;
398  }
399  };
400 
411  template<typename vector_data_type,
412  typename vector_index_type,
413  typename vector_index_type_reo,
414  typename vector_reduction,
415  unsigned int impl>
416  struct sparse_vector_reduction_cpu
417  {
418  //! output vector for the reduced data
419  vector_data_type & vector_data_red;
420 
421  //! vector with the data to reduce (reordered added data)
422  vector_data_type & vector_data;
423 
424  //! sorted pairs (index, original insert position) of the added elements
425  vector_index_type_reo & reorder_add_index_cpu;
426 
427  //! output vector of the reduced indexes
428  vector_index_type & vector_index;
429 
436  inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red,
437  vector_data_type & vector_data,
438  vector_index_type & vector_index,
439  vector_index_type_reo & reorder_add_index_cpu)
440  :vector_data_red(vector_data_red),vector_data(vector_data),vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu)
441  {};
442 
444  template<typename T>
445  inline void operator()(T& t) const
446  {
447  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
448  typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
449 
450  if (reduction_type::is_special() == false)
451  {
452  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; )
453  {
454  sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu);
455 
456 /* size_t start = reorder_add_index_cpu.get(i).id;
457  red_type red = vector_data.template get<reduction_type::prop::value>(i);
458 
459  size_t j = 1;
460  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
461  {
462  cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
463  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
464  }
465  vector_data_red.add();
466  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
467 
468  if (T::value == 0)
469  {
470  vector_index.add();
471  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
472  }
473 
474  i += j;*/
475  }
476  }
477  }
478  };
479 
490  template<typename encap_src,
491  typename encap_dst,
492  typename vector_reduction>
493  struct sparse_vector_reduction_solve_conflict_assign_cpu
494  {
495  //! source encap object
496  encap_src & src;
497 
498  //! destination encap object
499  encap_dst & dst;
500 
501 
508  inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst)
509  :src(src),dst(dst)
510  {};
511 
513  template<typename T>
514  inline void operator()(T& t) const
515  {
516  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
517 
518  dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>();
519  }
520  };
521 
522 
523  template<unsigned int impl,typename vector_reduction, typename T,typename red_type>
524  struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl
525  {
526  template<typename encap_src, typename encap_dst>
527  static inline void red(encap_src & src, encap_dst & dst)
528  {
529  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
530 
531  cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
532  }
533  };
534 
535  template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1>
536  struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]>
537  {
538  template<typename encap_src, typename encap_dst>
539  static inline void red(encap_src & src, encap_dst & dst)
540  {
541  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
542 
543  auto src_e = src.template get<reduction_type::prop::value>();
544  auto dst_e = dst.template get<reduction_type::prop::value>();
545 
546  cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e);
547  }
548  };
549 
560  template<typename encap_src,
561  typename encap_dst,
562  typename vector_reduction,
563  unsigned int impl>
564  struct sparse_vector_reduction_solve_conflict_reduce_cpu
565  {
566  //! source encap object
567  encap_src & src;
568 
569  //! destination encap object
570  encap_dst & dst;
571 
572 
579  inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst)
580  :src(src),dst(dst)
581  {};
582 
584  template<typename T>
585  inline void operator()(T& t) const
586  {
587  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
588  typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type;
589 
590  sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst);
591 
592 // cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
593 // reduction_type::red(dst.template get<reduction_type::prop::value>(),src.template get<reduction_type::prop::value>());
594  }
595  };
596 
607  template<typename vector_data_type,
608  typename vector_index_type,
609  typename vector_index_type2,
610  typename vector_reduction,
611  typename block_functor,
612  unsigned int impl2, unsigned int pSegment=1>
613  struct sparse_vector_reduction
614  {
615  //! output vector for the reduced data
616  vector_data_type & vector_data_red;
617 
618  //! new data to reduce (sorted)
619  vector_data_type & vector_data;
620 
621  //! new data to reduce (unsorted)
622  vector_data_type & vector_data_unsorted;
623 
624  //! segment offsets
625  vector_index_type2 & segment_offset;
626 
627  //! map of the data
628  vector_index_type & vector_data_map;
629 
630  //! block functor
631  block_functor & blf;
632 
633  //! gpu context
634  gpu::ofp_context_t & gpuContext;
635 
642  inline sparse_vector_reduction(vector_data_type & vector_data_red,
643  vector_data_type & vector_data,
644  vector_data_type & vector_data_unsorted,
645  vector_index_type & vector_data_map,
646  vector_index_type2 & segment_offset,
647  block_functor & blf,
648  gpu::ofp_context_t & gpuContext)
649  :vector_data_red(vector_data_red),
650  vector_data(vector_data),
651  vector_data_unsorted(vector_data_unsorted),
652  segment_offset(segment_offset),
653  vector_data_map(vector_data_map),
654  blf(blf),
655  gpuContext(gpuContext)
656  {};
657 
659  template<typename T>
660  inline void operator()(T& t) const
661  {
662 #ifdef __NVCC__
663 
664  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
665  typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
666  if (reduction_type::is_special() == false)
667  {
668  scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>(
669  vector_data,
670  vector_data_unsorted,
671  vector_data_map,
672  segment_offset,
673  vector_data_red,
674  blf,
675  gpuContext);
676  }
677 #else
678  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
679 #endif
680  }
681  };
682 
683 
684  struct stub_block_functor
685  {
686  template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type>
687  static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst)
688  {
689  return true;
690  }
691 
692  template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
693  static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
694  vector_data_type &data1, vector_data_type &data2,
695  vector_index_type &indices_tmp, vector_data_type &data_tmp,
696  vector_index_type &keysOut, vector_data_type &dataOut,
697  gpu::ofp_context_t& gpuContext)
698  {
699  return true;
700  }
701 
702  openfpm::vector_gpu<aggregate<unsigned int>> outputMap;
703 
704  openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
705  {
706  return outputMap;
707  }
708 
709  const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
710  {
711  return outputMap;
712  }
713  };
714 
725  template<typename vector_data_type, typename vector_index_type, typename vector_reduction>
726  struct sparse_vector_special
727  {
728  //! output vector for the reduced data
729  vector_data_type & vector_data_red;
730 
731  //! data to reduce
732  vector_data_type & vector_data;
733 
734  //! segment offsets
735  vector_index_type & segment_offset;
736 
737  //! gpu context
738  gpu::ofp_context_t & gpuContext;
739 
746  inline sparse_vector_special(vector_data_type & vector_data_red,
747  vector_data_type & vector_data,
748  vector_index_type & segment_offset,
749  gpu::ofp_context_t & gpuContext)
750  :vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),gpuContext(gpuContext)
751  {};
752 
754  template<typename T>
755  inline void operator()(T& t) const
756  {
757 #ifdef __NVCC__
758 
759  typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type;
760 
761  // reduction type
762  typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
763 
764  if (reduction_type::is_special() == true)
765  {
766  auto ite = segment_offset.getGPUIterator();
767 
768  CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>),
769  ite,segment_offset.toKernel(),vector_data_red.toKernel(),
770  (typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type) vector_data.size());
771  }
772 
773 #else
774  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
775 #endif
776  }
777  };
778 
779  template<typename T,
780  typename Ti = long int,
781  typename Memory=HeapMemory,
782  typename layout=typename memory_traits_lin<T>::type,
783  template<typename> class layout_base=memory_traits_lin ,
784  typename grow_p=grow_policy_double,
785  unsigned int impl=vect_isel<T>::value,
786  unsigned int impl2 = VECTOR_SPARSE_STANDARD,
787  typename block_functor = stub_block_functor>
788  class vector_sparse
789  {
790  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
791  vector<T,Memory,layout_base,grow_p,impl> vct_data;
792  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;
793 
794  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
795  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
796  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
797  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
798  vector<T,Memory,layout_base,grow_p> vct_add_data;
799  vector<T,Memory,layout_base,grow_p> vct_add_data_reord;
800 
801  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0;
802  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1;
803  vector<T,Memory,layout_base,grow_p> vct_add_data_cont;
804  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique;
805 
806  vector<T,Memory,layout_base,grow_p,impl> vct_add_data_unique;
807 
808  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4;
809  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
810  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2;
811  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3;
812  vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp;
813 
814  block_functor blf;
815 
816  T bck;
817 
818  CudaMemory mem;
819 
820  openfpm::vector<reorder<Ti>> reorder_add_index_cpu;
821 
822  size_t max_ele;
823 
824  int n_gpu_add_block_slot = 0;
825  int n_gpu_rem_block_slot = 0;
826 
833  template<bool prefetch>
834  inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
835  {
836  if (vct_index.size() == 0) {id = 0; return -1;}
837  const Ti *base = &vct_index.template get<0>(0);
838  const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
839  Ti n = vct_data.size()-1;
840  while (n > 1)
841  {
842  Ti half = n / 2;
843  if (prefetch)
844  {
845  __builtin_prefetch(base + half/2, 0, 0);
846  __builtin_prefetch(base + half + half/2, 0, 0);
847  }
848  base = (base[half] < x) ? base+half : base;
849  n -= half;
850  }
851 
852  int off = (*base < x);
853  id = base - &vct_index.template get<0>(0) + off;
854  return (base + off != end)?*(base + off):-1;
855  }
856 
863  template<bool prefetch>
864  inline void _branchfree_search(Ti x, Ti & id) const
865  {
866  Ti v = _branchfree_search_nobck<prefetch>(x,id);
867  id = (x == v)?id:vct_data.size()-1;
868  }
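/* _branchfree_search_nobck above is a branchless lower-bound: at each step the
 * base pointer either advances by half or stays put, driven by a conditional
 * move instead of a data-dependent branch. A standalone scalar version of the
 * same idea (editor's sketch, assumes n >= 1 and a sorted array):
 *
 * \code
 * #include <cstddef>
 *
 * // position of the first element >= x in base[0..n)
 * inline size_t branchfree_lower_bound(const long int * base, size_t n, long int x)
 * {
 *     const long int * p = base;
 *     while (n > 1)
 *     {
 *         size_t half = n / 2;
 *         p = (p[half] < x) ? p + half : p;   // compiles to a conditional move
 *         n -= half;
 *     }
 *     return (p - base) + (*p < x);
 * }
 * \endcode
 */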
869 
870 
871  /* \brief take the indexes from the insertion pools and create a continuous array
872  *
873  * \param vct_nadd_index number of insertions in each pool
874  * \param vct_add_index pool of inserts
875  * \param vct_add_index_cont_0 output continuous array of inserted indexes
876  * \param vct_add_data_cont continuous array of inserted data
877  * \param gpuContext gpu context
878  *
879  */
880  size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
881  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index,
882  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
883  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
884  vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
885  gpu::ofp_context_t& gpuContext)
886  {
887 #ifdef __NVCC__
888 
889  // Add 0 to the last element to vct_nadd_index
890  vct_nadd_index.resize(vct_nadd_index.size()+1);
891  vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
892  vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);
893 
894  // Merge the list of inserted points for each block
895  vct_index_tmp4.resize(vct_nadd_index.size());
896 
897  openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
898  vct_nadd_index.size(),
899  (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() ,
900  gpuContext);
901 
902  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
903  size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
904 
905  // we reuse vct_nadd_index
906  vct_add_index_cont_0.resize(n_ele);
907  vct_add_index_cont_1.resize(n_ele);
908 
909  if (impl2 == VECTOR_SPARSE_STANDARD)
910  {
911  vct_add_data_cont.resize(n_ele);
912  }
913 
914  if (n_gpu_add_block_slot >= 128)
915  {
916  ite_gpu<1> itew;
917  itew.wthr.x = vct_nadd_index.size()-1;
918  itew.wthr.y = 1;
919  itew.wthr.z = 1;
920  itew.thr.x = 128;
921  itew.thr.y = 1;
922  itew.thr.z = 1;
923 
924  CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
925  vct_nadd_index.toKernel(),
926  vct_index_tmp4.toKernel(),
927  vct_add_index_cont_0.toKernel(),
928  vct_add_index_cont_1.toKernel(),
929  n_gpu_add_block_slot);
930  }
931  else
932  {
933  auto itew = vct_add_index.getGPUIterator();
934 
935  CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
936  vct_nadd_index.toKernel(),
937  vct_index_tmp4.toKernel(),
938  vct_add_index_cont_0.toKernel(),
939  vct_add_index_cont_1.toKernel(),
940  n_gpu_add_block_slot);
941  }
942 
943  return n_ele;
944 #endif
945  return 0;
946  }
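/* The compaction above hinges on an exclusive scan of the per-pool counters; a
 * toy trace of the arithmetic (editor's sketch, the real work happens on the
 * device via openfpm::scan):
 *
 * \code
 * // per-pool counts (vct_nadd_index, trailing 0 appended):  3 1 0 2 0
 * // exclusive scan  (vct_index_tmp4):                       0 3 4 4 6
 * // last element = n_ele = 6 total insertions; pool i writes its keys into
 * // the continuous range [tmp4[i], tmp4[i+1]) of vct_add_index_cont_0.
 * \endcode
 */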
947 
957  void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
958  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
959  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
960  vector<T,Memory,layout_base,grow_p> & vct_add_data,
961  gpu::ofp_context_t& gpuContext)
962  {
963 #ifdef __NVCC__
964  ite_gpu<1> itew;
965  itew.wthr.x = vct_nadd_index.size()-1;
966  itew.wthr.y = 1;
967  itew.wthr.z = 1;
968  itew.thr.x = 128;
969  itew.thr.y = 1;
970  itew.thr.z = 1;
971 
972  size_t n_ele = vct_add_index_cont_0.size();
973 
974  n_gpu_add_block_slot = 0;
975 
976  // now we sort
977  openfpm::sort(
978  (Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),
979  (Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
980  vct_add_index_cont_0.size(),
981  gpu::template less_t<Ti>(),
982  gpuContext);
983 
984  auto ite = vct_add_index_cont_0.getGPUIterator();
985 
986  // Now we reorder the data vector accordingly to the indexes
987 
988  if (impl2 == VECTOR_SPARSE_STANDARD)
989  {
990  vct_add_data_reord.resize(n_ele);
991  CUDA_LAUNCH(reorder_vector_data,ite,vct_add_index_cont_1.toKernel(),vct_add_data.toKernel(),vct_add_data_reord.toKernel());
992  }
993 
994 #endif
995  }
996 
1003  template<typename ... v_reduce>
1004  void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1005  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
1006  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_index_tmp,
1007  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_index_tmp2,
1008  gpu::ofp_context_t& gpuContext)
1009  {
1010 #ifdef __NVCC__
1011 
1012  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1013 
1014  auto ite = vct_add_index_cont_0.getGPUIterator();
1015 
1016  mem.allocate(sizeof(int));
1017  mem.fill(0);
1018  vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);
1019 
1020  ite = vct_add_index_cont_0.getGPUIterator();
1021 
1022  vct_index_tmp4.resize(vct_add_index_cont_0.size()+1);
1023 
1024  CUDA_LAUNCH(
1025  (
1026  find_buffer_offsets_for_scan
1027  <0,
1028  decltype(vct_add_index_cont_0.toKernel()),
1029  decltype(vct_index_tmp4.toKernel())
1030  >
1031  ),
1032  ite,
1033  vct_add_index_cont_0.toKernel(),
1034  vct_index_tmp4.toKernel());
1035 
1036  openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),gpuContext);
1037 
1038  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1039  int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1040 
1041  vct_add_index_unique.resize(n_ele_unique);
1042 
1043  if (impl2 == VECTOR_SPARSE_STANDARD)
1044  {
1045  vct_add_data_unique.resize(n_ele_unique);
1046  }
1047 
1048  CUDA_LAUNCH(
1049  (construct_index_unique<0>),
1050  ite,
1051  vct_add_index_cont_0.toKernel(),
1052  vct_index_tmp4.toKernel(),
1053  vct_add_index_unique.toKernel());
1054 
1055  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1056 
1057  // Then we merge the two list vct_index and vct_add_index_unique
1058 
1059  // index to get merge index
1060  vct_m_index.resize(vct_index.size());
1061 
1062  if (vct_m_index.size() != 0)
1063  {
1064  ite = vct_m_index.getGPUIterator();
1065  CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1066  }
1067 
1068  // after merge we solve the last conflicts, running across the vector again and emitting 1 when there is something to merge
1069  // we reorder the data array also
1070 
1071  vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1072  vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());
1073  vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size());
1074 
1075  // Do not delete this reserve
1076  // Unfortunately all resize with DataBlocks are broken
1077  if (impl2 == VECTOR_SPARSE_STANDARD)
1078  {
1079  vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
1080  vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
1081  }
1082 
1083  ite = vct_add_index_unique.getGPUIterator();
1084  vct_index_tmp4.resize(vct_add_index_unique.size());
1085  CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),(int)vct_index.size());
1086 
1087  ite_gpu<1> itew;
1088 
1089  itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1090  itew.wthr.y = 1;
1091  itew.wthr.z = 1;
1092  itew.thr.x = 128;
1093  itew.thr.y = 1;
1094  itew.thr.z = 1;
1095 
1096  vct_index_dtmp.resize(itew.wthr.x);
1097 
1098  // we merge vct_index with vct_add_index_unique into vct_index_tmp; vct_index_tmp contains the merged indexes
1099  //
1100 
1101  openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1102  (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
1103  (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),gpuContext);
1104 
1105 
1106 #endif
1107  }
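/* merge_indexes tags each key with its source position before merging: existing
 * keys carry positions 0..vct_index.size()-1, while the new unique keys carry
 * positions offset by vct_index.size() (set_indexes above). A worked example
 * (editor's sketch):
 *
 * \code
 * // vct_index            = {2, 9}   positions {0, 1}
 * // vct_add_index_unique = {5, 9}   positions {2, 3}   (offset = 2)
 * // merged keys      vct_index_tmp  = {2, 5, 9, 9}
 * // merged positions vct_index_tmp2 = {0, 2, 1, 3}
 * // a position >= vct_index.size() means "take the element from the add
 * // buffer"; the adjacent 9/9 pair is a conflict solve_conflicts reduces.
 * \endcode
 */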
1108 
1109 
1110 
1111  template<typename ... v_reduce>
1112  void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1113  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
1114  vector<T,Memory,layout_base,grow_p> & vct_add_data,
1115  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1116  gpu::ofp_context_t& gpuContext)
1117  {
1118 #ifdef __NVCC__
1119  ite_gpu<1> itew;
1120  itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1121  itew.wthr.y = 1;
1122  itew.wthr.z = 1;
1123  itew.thr.x = 128;
1124  itew.thr.y = 1;
1125  itew.thr.z = 1;
1126 
1127  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1128 
1130 
1131  // Now we can do a segmented reduction
1132  scalar_block_implementation_switch<impl2, block_functor>
1133  ::template extendSegments<1>(vct_add_index_unique, vct_add_index_cont_1.size());
1134 
1135  if (impl2 == VECTOR_SPARSE_STANDARD)
1136  {
1137  sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
1138  decltype(vct_add_index_cont_1),
1139  decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
1140  svr(
1141  vct_add_data_unique,
1142  vct_add_data_reord,
1143  vct_add_data,
1144  vct_add_index_cont_1,
1145  vct_add_index_unique,
1146  blf,
1147  gpuContext);
1148 
1149  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1150  vct_add_index_unique.remove(vct_add_index_unique.size()-1);
1151  }
1152 
1153  sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
1154  decltype(vct_add_index_unique),
1155  vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,gpuContext);
1156  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);
1157 
1159 
1160  // Now perform the right solve_conflicts according to impl2
1161  scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
1162  decltype(vct_data),
1163  decltype(vct_index),
1164  decltype(vct_add_index_unique),
1165  decltype(vct_index_dtmp),
1166  Ti,
1167  v_reduce ...
1168  >
1169  (
1170  vct_index,
1171  vct_index_tmp,
1172  vct_index_tmp2,
1173  vct_index_tmp3,
1174  vct_index_dtmp,
1175  vct_add_index_cont_1,
1176  vct_add_index_unique,
1177  vct_data,
1178  vct_add_data,
1179  vct_add_data_unique,
1180  vct_add_data_cont,
1181  itew,
1182  blf,
1183  gpuContext
1184  );
1185 
1186 
1187 #else
1188  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1189 #endif
1190  }
1191 
1192  template<typename ... v_reduce>
1193  void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1194  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1195  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1196  gpu::ofp_context_t& gpuContext)
1197  {
1198 #ifdef __NVCC__
1199 
1200  // To avoid the case where you never called setGPUInsertBuffer
1201  if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
1202  {
1203  return;
1204  }
1205 
1206  size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,
1207  vct_add_index_cont_1,vct_add_data_cont,gpuContext);
1208 
1209 
1210  // At this point it is possible that nothing was actually inserted;
1211  // in that case, return without further ado...
1212  if (vct_add_index_cont_0.size() == 0)
1213  {return;}
1214 
1215  reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,gpuContext);
1216 
1217  merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
1218  vct_index_tmp,vct_index_tmp2,
1219  gpuContext);
1220 
1221  merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,gpuContext);
1222 
1223 #else
1224  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1225 #endif
1226  }
1227 
1228 
1229  void flush_on_gpu_remove(
1230  gpu::ofp_context_t& gpuContext)
1231  {
1232 #ifdef __NVCC__
1233 
1234  // Add 0 to the last element to vct_nadd_index
1235  vct_nrem_index.resize(vct_nrem_index.size()+1);
1236  vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
1237  vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);
1238 
1239  // Merge the list of inserted points for each block
1240  vct_index_tmp4.resize(vct_nrem_index.size());
1241 
1242  openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>(), gpuContext);
1243 
1244  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1245  size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1246 
1247  // we reuse vct_nadd_index
1248  vct_add_index_cont_0.resize(n_ele);
1249  vct_add_index_cont_1.resize(n_ele);
1250 
1251  ite_gpu<1> itew;
1252  itew.wthr.x = vct_nrem_index.size()-1;
1253  itew.wthr.y = 1;
1254  itew.wthr.z = 1;
1255  itew.thr.x = 128;
1256  itew.thr.y = 1;
1257  itew.thr.z = 1;
1258 
1259  CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
1260  vct_nrem_index.toKernel(),
1261  vct_index_tmp4.toKernel(),
1262  vct_add_index_cont_0.toKernel(),
1263  vct_add_index_cont_1.toKernel(),
1264  n_gpu_rem_block_slot);
1265 
1266  // now we sort
1267  openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
1268  vct_add_index_cont_0.size(), gpu::template less_t<Ti>(), gpuContext);
1269 
1270  auto ite = vct_add_index_cont_0.getGPUIterator();
1271 
1272  mem.allocate(sizeof(int));
1273  mem.fill(0);
1274  vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);
1275 
1276  ite = vct_add_index_cont_0.getGPUIterator();
1277 
1278  // produce unique index list
1279  // Find the buffer bases
1280  CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>),
1281  ite,
1282  vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());
1283 
1284  mem.deviceToHost();
1285  int n_ele_unique = *(int *)mem.getPointer();
1286 
1287  vct_add_index_unique.resize(n_ele_unique);
1288 
1289  openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
1290  vct_add_index_unique.size(),gpu::template less_t<Ti>(),gpuContext);
1291 
1292  // Then we merge the two list vct_index and vct_add_index_unique
1293 
1294  // index to get merge index
1295  vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());
1296 
1297  ite = vct_m_index.getGPUIterator();
1298  CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1299 
1300  ite = vct_add_index_unique.getGPUIterator();
1301  CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),(int)vct_index.size());
1302 
1303  // after merge we solve the last conflicts, running across the vector again and emitting 1 when there is something to merge
1304  // we reorder the data array also
1305 
1306  vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1307  vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());
1308 
1309  itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1310  itew.wthr.y = 1;
1311  itew.wthr.z = 1;
1312  itew.thr.x = 128;
1313  itew.thr.y = 1;
1314  itew.thr.z = 1;
1315 
1316  vct_index_dtmp.resize(itew.wthr.x);
1317 
1318  // we merge vct_index with vct_add_index_unique into vct_index_tmp; vct_index_tmp2 contains the merging index
1319  //
1320  openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1321  (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
1322  (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),gpuContext);
1323 
1324  vct_index_tmp3.resize(128*itew.wthr.x);
1325 
1326  CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>),
1327  itew,
1328  vct_index_tmp.toKernel(),
1329  vct_index_tmp2.toKernel(),
1330  vct_index_tmp3.toKernel(),
1331  vct_m_index.toKernel(),
1332  vct_index_dtmp.toKernel(),
1333  (int)vct_index.size());
1334 
1335  // we scan tmp3
1336  openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),gpuContext);
1337 
1338  // get the size to resize vct_index and vct_data
1339  vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
1340  int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
1341 
1342  vct_add_data_cont.resize(size);
1343  vct_index.resize(size);
1344 
1345  CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
1346  vct_index.toKernel(),vct_add_data_cont.toKernel(),
1347  vct_index_dtmp.toKernel());
1348 
1349  vct_data.swap(vct_add_data_cont);
1350 
1351 #else
1352  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1353 #endif
1354  }
1355 
1356  void resetBck()
1357  {
1358  // re-add background
1359  vct_data.resize(vct_data.size()+1);
1360  vct_data.get(vct_data.size()-1) = bck;
1361 
1362  htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1363  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1364  }
1365 
1366  template<typename ... v_reduce>
1367  void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1368  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1369  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1370  gpu::ofp_context_t& gpuContext)
1371  {
1372  flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,gpuContext);
1373  }
1374 
1375  template<typename ... v_reduce>
1376  void flush_on_cpu()
1377  {
1378  if (vct_add_index.size() == 0)
1379  {return;}
1380 
1381  // First copy the added index to reorder
1382  reorder_add_index_cpu.resize(vct_add_index.size());
1383  vct_add_data_cont.resize(vct_add_index.size());
1384 
1385  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1386  {
1387  reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
1388  reorder_add_index_cpu.get(i).id2 = i;
1389  }
1390 
1391  reorder_add_index_cpu.sort();
1392 
1393  // Copy the data
1394  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1395  {
1396  vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
1397  }
1398 
1399  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1400 
1401  sparse_vector_reduction_cpu<decltype(vct_add_data),
1402  decltype(vct_add_index_unique),
1403  decltype(reorder_add_index_cpu),
1404  vv_reduce,
1405  impl2>
1406  svr(vct_add_data_unique,
1407  vct_add_data_cont,
1408  vct_add_index_unique,
1409  reorder_add_index_cpu);
1410 
1411  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1412 
1413  // merge the data
1414 
1415  vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
1416  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
1417 
1418  vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
1419  vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1420 
1421  Ti di = 0;
1422  Ti ai = 0;
1423  size_t i = 0;
1424 
1425  for ( ; i < vct_data_tmp.size() ; i++)
1426  {
1427  Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
1428  Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();
1429 
1430  if ( id_a <= id_d )
1431  {
1432  vct_index_tmp.template get<0>(i) = id_a;
1433 
1434  if (id_a == id_d)
1435  {
1436  auto dst = vct_data_tmp.get(i);
1437  auto src = vct_add_data_unique.get(ai);
1438 
1439  sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
1440  decltype(vct_add_data.get(ai)),
1441  vv_reduce>
1442  sva(src,dst);
1443 
1444  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
1445  ai++;
1446 
1447  dst = vct_data_tmp.get(i);
1448  src = vct_data.get(di);
1449 
1450  sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
1451  decltype(vct_data.get(di)),
1452  vv_reduce,
1453  impl2>
1454  svr(src,dst);
1455  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1456 
1457  di++;
1458 
1459  vct_data_tmp.resize(vct_data_tmp.size()-1);
1460  vct_index_tmp.resize(vct_index_tmp.size()-1);
1461  }
1462  else
1463  {
1464  vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
1465  vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
1466  ai++;
1467  }
1468  }
1469  else
1470  {
1471  vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
1472  vct_data_tmp.get(i) = vct_data.get(di);
1473  di++;
1474  }
1475  }
1476 
1477  vct_index.swap(vct_index_tmp);
1478  vct_data.swap(vct_data_tmp);
1479 
1480  vct_add_data.clear();
1481  vct_add_index.clear();
1482  vct_add_index_unique.clear();
1483  vct_add_data_unique.clear();
1484  }
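/* A worked pass of the host merge loop above (editor's sketch): with
 * vct_index = {2, 9} and reduced insertions vct_add_index_unique = {5, 9},
 *
 * \code
 * // i=0: id_a=5, id_d=2 -> keep (2, old data),                    di=1
 * // i=1: id_a=5, id_d=9 -> copy (5, new data),                    ai=1
 * // i=2: id_a=9 == id_d -> assign the new data, reduce the old
 * //                        value into it, ai=2, di=2, and shrink
 * //                        the tmp vectors by the duplicate
 * \endcode
 *
 * leaving vct_index = {2, 5, 9} with the conflict on key 9 resolved.
 */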
1485 
1486  public:
1487 
1488  vector_sparse()
1489  :max_ele(0)
1490  {
1491  vct_data.resize(1);
1492  }
1493 
1498  auto getIndexBuffer() -> decltype(vct_index)&
1499  {
1500  return vct_index;
1501  }
1502 
1507  auto getDataBuffer() -> decltype(vct_data)&
1508  {
1509  return vct_data;
1510  }
1511 
1516  auto getIndexBuffer() const -> const decltype(vct_index)&
1517  {
1518  return vct_index;
1519  }
1520 
1525  auto getDataBuffer() const -> const decltype(vct_data)&
1526  {
1527  return vct_data;
1528  }
1529 
1541  openfpm::sparse_index<Ti> get_sparse(Ti id) const
1542  {
1543  Ti di;
1544  this->_branchfree_search<false>(id,di);
1545  openfpm::sparse_index<Ti> sid;
1546  sid.id = di;
1547 
1548  return sid;
1549  }
1550 
1561  template <unsigned int p>
1562  inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
1563  {
1564  Ti di;
1565  this->_branchfree_search<false>(id,di);
1566  return vct_data.template get<p>(di);
1567  }
1568 
1579  inline auto get(Ti id) const -> decltype(vct_data.get(id))
1580  {
1581  Ti di;
1582  this->_branchfree_search<false>(id,di);
1583  return vct_data.get(di);
1584  }
1585 
1591  void resize(size_t n)
1592  {
1593  max_ele = n;
1594  }
1595 
1604  void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv)
1605  {
1606  vct_index.swap(iv);
1607  }
1608 
1614  template <unsigned int p>
1615  auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
1616  {
1617  return vct_data.template get<p>(vct_data.size()-1);
1618  }
1619 
1625  auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
1626  {
1627  return vct_data.get(vct_data.size()-1);
1628  }
1629 
1630  template<unsigned int p>
1631  void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
1632  {
1633  meta_copy_d<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type,
1634  typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
1635  ::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));
1636 
1637  vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);
1638 
1639  meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>
1640  ::meta_copy_(bck_,bck.template get<p>());
1641  }
1642 
1650  template <unsigned int p>
1651  auto insert(Ti ele) -> decltype(vct_data.template get<p>(0))
1652  {
1653  vct_add_index.add();
1654  vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1655  vct_add_data.add();
1656  return vct_add_data.template get<p>(vct_add_data.size()-1);
1657  }
1658 
1666  template <unsigned int p>
1667  auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0))
1668  {
1669  is_new = false;
1670  Ti di;
1671 
1672  // first we have to search if the block exist
1673  Ti v = _branchfree_search_nobck<true>(ele,di);
1674 
1675  if (v == ele)
1676  {
1677  // block exist
1678  return vct_data.template get<p>(di);
1679  }
1680  is_new = true;
1681 
1682  // It does not exist; we create it. di contains the index where the new element has to be created
1683  vct_index.insert(di);
1684  vct_data.insert(di);
1685  vct_index.template get<0>(di) = ele;
1686  return vct_data.template get<p>(di);
1687  }
1688 
1694  auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.get(0))
1695  {
1696  is_new = false;
1697  Ti di;
1698 
1699  // first we have to search if the block exist
1700  Ti v = _branchfree_search_nobck<true>(ele,di);
1701 
1702  if (v == ele)
1703  {
1704  // block exist
1705  return vct_data.get(di);
1706  }
1707 
1708  // It does not exist; we create it. di contains the index where the new element has to be created
1709  vct_index.insert(di);
1710  vct_data.insert(di);
1711  is_new = true;
1712 
1713  vct_index.template get<0>(di) = ele;
1714 
1715  return vct_data.get(di);
1716  }
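/* insertFlush writes straight into the sorted arrays, so no flush call is
 * required afterwards; usage sketch (editor's example, a single float property
 * assumed):
 *
 * \code
 * openfpm::vector_sparse<aggregate<float>> vs;
 * bool is_new;
 * vs.template insertFlush<0>(42, is_new) = 1.5f;   // is_new == true: slot created
 * vs.template insertFlush<0>(42, is_new) += 1.0f;  // is_new == false: same slot
 * float v = vs.template get<0>(42);                // 2.5f
 * \endcode
 */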
1717 
1723  auto insert(Ti ele) -> decltype(vct_data.get(0))
1724  {
1725  vct_add_index.add();
1726  vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1727  vct_add_data.add();
1728  return vct_add_data.get(vct_add_data.size()-1);
1729  }
1730 
1738  template<typename ... v_reduce>
1739  void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1740  gpu::ofp_context_t& gpuContext,
1741  flush_type opt = FLUSH_ON_HOST,
1742  int i = 0)
1743  {
1744  // Eliminate background
1745  vct_data.resize(vct_index.size());
1746 
1747  if (opt & flush_type::FLUSH_ON_DEVICE)
1748  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,gpuContext);}
1749  else
1750  {this->flush_on_cpu<v_reduce ... >();}
1751 
1752  resetBck();
1753  }
1754 
1762  template<typename ... v_reduce>
1763  void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1764  gpu::ofp_context_t& gpuContext,
1765  flush_type opt = FLUSH_ON_HOST)
1766  {
1767  // Eliminate background
1768  vct_data.resize(vct_index.size());
1769 
1770  if (opt & flush_type::FLUSH_ON_DEVICE)
1771  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,gpuContext);}
1772  else
1773  {this->flush_on_cpu<v_reduce ... >();}
1774 
1775  resetBck();
1776  }
1777 
1783  template<typename ... v_reduce>
1784  void flush(gpu::ofp_context_t& gpuContext, flush_type opt = FLUSH_ON_HOST)
1785  {
1786  // Eliminate background
1787  vct_data.resize(vct_index.size());
1788 
1789  if (opt & flush_type::FLUSH_ON_DEVICE)
1790  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,gpuContext);}
1791  else
1792  {this->flush_on_cpu<v_reduce ... >();}
1793 
1794  resetBck();
1795  }
1796 
1802  void flush_remove(gpu::ofp_context_t& gpuContext, flush_type opt = FLUSH_ON_HOST)
1803  {
1804  vct_data.resize(vct_data.size()-1);
1805 
1806  if (opt & flush_type::FLUSH_ON_DEVICE)
1807  {this->flush_on_gpu_remove(gpuContext);}
1808  else
1809  {
1810  std::cerr << __FILE__ << ":" << __LINE__ << " error, flush_remove on CPU has not been implemented yet";
1811  }
1812 
1813  resetBck();
1814  }
1815 
1820  size_t size()
1821  {
1822  return vct_index.size();
1823  }
1824 
1829  vector<aggregate<Ti>,Memory,layout_base,grow_p> &
1830  private_get_vct_index()
1831  {
1832  return vct_index;
1833  }
1834 
1840  template<unsigned int ... prp>
1841  void deviceToHost()
1842  {
1843  vct_index.template deviceToHost<0>();
1844  vct_data.template deviceToHost<prp...>();
1845  }
1846 
1852  template<unsigned int ... prp>
1853  void hostToDevice()
1854  {
1855  vct_index.template hostToDevice<0>();
1856  vct_data.template hostToDevice<prp...>();
1857  }
1858 
1864  vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
1865  {
1866  vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
1867  vct_add_index.toKernel(),
1868  vct_rem_index.toKernel(),vct_add_data.toKernel(),
1869  vct_nadd_index.toKernel(),
1870  vct_nrem_index.toKernel(),
1871  n_gpu_add_block_slot,
1872  n_gpu_rem_block_slot);
1873 
1874  return mvsck;
1875  }
1876 
1883  void setGPUInsertBuffer(int nblock, int nslot)
1884  {
1885  vct_add_index.resize(nblock*nslot);
1886  vct_nadd_index.resize(nblock);
1887  vct_add_data.resize(nblock*nslot);
1888  n_gpu_add_block_slot = nslot;
1889  vct_nadd_index.template fill<0>(0);
1890  }
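/* A typical device-side insert cycle around setGPUInsertBuffer (editor's
 * sketch; my_insert_kernel and the reduction functor sadd_<0> are hypothetical,
 * only the vector_sparse calls are from this file):
 *
 * \code
 * vs.setGPUInsertBuffer(nblock, nslot);               // one insert pool per block
 * CUDA_LAUNCH(my_insert_kernel, ite, vs.toKernel());  // kernels fill the pools
 * vs.template flush<sadd_<0>>(gpuContext, FLUSH_ON_DEVICE);  // merge on GPU
 * \endcode
 */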
1891 
1897  void preFlush()
1898  {
1899 #ifdef __NVCC__
1900  vct_nadd_index.resize(vct_add_index.size());
1901 
1902  if (vct_nadd_index.size() != 0)
1903  {
1904  auto ite = vct_nadd_index.getGPUIterator();
1905  CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
1906  }
1907  n_gpu_add_block_slot = 1;
1908 #endif
1909  }
1910 
1915  auto getGPUInsertBuffer() -> decltype(vct_add_data)&
1916  {
1917  return vct_add_data;
1918  }
1919 
1926  void setGPURemoveBuffer(int nblock, int nslot)
1927  {
1928  vct_rem_index.resize(nblock*nslot);
1929  vct_nrem_index.resize(nblock);
1930  n_gpu_rem_block_slot = nslot;
1931  vct_nrem_index.template fill<0>(0);
1932  }
1933 
1934 #ifdef CUDA_GPU
1935 
1941  auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
1942  {
1943  return vct_index.getGPUIterator();
1944  }
1945 
1946 #endif
1947 
1952  void clear()
1953  {
1954  vct_data.clear();
1955  vct_index.clear();
1956  vct_add_index.clear();
1957  vct_add_data.clear();
1958  vct_add_index_cont_0.clear();
1959 
1960  // re-add background
1961  vct_data.resize(vct_data.size()+1);
1962  vct_data.get(vct_data.size()-1) = bck;
1963 
1964  htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1965  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1966 
1967  max_ele = 0;
1968  n_gpu_add_block_slot = 0;
1969  n_gpu_rem_block_slot = 0;
1970  }
1971 
1972  void swap(vector_sparse<T,Ti,Memory,layout,layout_base,grow_p,impl,impl2,block_functor> & sp)
1973  {
1974  vct_data.swap(sp.vct_data);
1975  vct_index.swap(sp.vct_index);
1976  vct_add_index.swap(sp.vct_add_index);
1977  vct_add_data.swap(sp.vct_add_data);
1978 
1979  size_t max_ele_ = sp.max_ele;
1980  sp.max_ele = max_ele;
1981  this->max_ele = max_ele_;
1982  }
1983 
1984  vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
1985  {
1986  return vct_add_data;
1987  }
1988 
1989  vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
1990  {
1991  return vct_add_index;
1992  }
1993 
1994  const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
1995  {
1996  return vct_add_index;
1997  }
1998 
1999  vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
2000  {
2001  return vct_nadd_index;
2002  }
2003 
2004  const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
2005  {
2006  return vct_nadd_index;
2007  }
2008 
2009  auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
2010  {
2011  return blf.get_outputMap();
2012  }
2013 
2014  auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
2015  {
2016  return blf.get_outputMap();
2017  }
2018 
2023  void removeUnusedBuffers()
2024  {
2025  vct_add_data.resize(0);
2026  vct_add_data.shrink_to_fit();
2027 
2031  vct_add_data_reord.resize(0);
2032  vct_add_data_reord.shrink_to_fit();
2033 
2034  vct_add_data_cont.resize(0);
2035  vct_add_data_cont.shrink_to_fit();
2036 
2037  vct_add_data_unique.resize(0);
2038  vct_add_data_unique.shrink_to_fit();
2039  }
2040 
2041  /* \brief Return the offsets of the segments for the merge indexes
2042  *
2043  *
2044  */
2045  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
2046  {
2047  return vct_add_index_unique;
2048  }
2049 
2050  const vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
2051  {
2052  return vct_add_index_unique;
2053  }
2054 
2073  vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
2074  {
2075  return vct_add_index_cont_1;
2076  }
2077 
2096  vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
2097  {
2098  return vct_index_tmp2;
2099  }
2100  };
2101 
2102 
2103  template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int>
2104  using vector_sparse_gpu = openfpm::vector_sparse<
2105  T,
2106  indexT,
2107  CudaMemory,
2108  typename memory_traits_inte<T>::type,
2109  memory_traits_inte,
2110  grow_policy_double,
2111  vect_isel<T>::value,
2112  blockSwitch,
2113  block_functor
2114  >;
2115 
2116  template<typename T, typename block_functor = stub_block_functor, typename indexT = long int>
2117  using vector_sparse_gpu_block = openfpm::vector_sparse<
2118  T,
2119  indexT,
2120  CudaMemory,
2121  typename memory_traits_inte<T>::type,
2122  memory_traits_inte,
2123  grow_policy_double,
2124  vect_isel<T>::value,
2125  VECTOR_SPARSE_BLOCK,
2126  block_functor
2127  >;
2128 }
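/* Declaring an instance of the device-oriented aliases above (editor's sketch;
 * the blocked variant's aggregate type and block functor are hypothetical):
 *
 * \code
 * // int keys, one float property, standard (non-blocked) flush path
 * openfpm::vector_sparse_gpu<aggregate<float>> vs_gpu;
 *
 * // long int keys, blocked flush path driven by a user-supplied block functor
 * // openfpm::vector_sparse_gpu_block<aggregate<MyBlock>, MyBlockFunctor> vs_blk;
 * \endcode
 */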
2129 
2130 
2131 
2132 #endif /* MAP_VECTOR_SPARSE_HPP_ */
sort the continuos array of inserted key
void setGPUInsertBuffer(int nblock, int nslot)
set the gpu insert buffer for every block
void swapIndexVector(vector< aggregate< Ti >, Memory, layout_base, grow_p > &iv)
Ti _branchfree_search_nobck(Ti x, Ti &id) const
get the element i
auto getDataBuffer() const -> const decltype(vct_data)&
Get the data buffer.
auto getDataBuffer() -> decltype(vct_data)&
Get the data buffer.
auto get(Ti id) const -> decltype(vct_data.get(id))
Get an element of the vector.
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
void resize(size_t n)
resize to n elements
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:204
size_t size()
Stub size.
Definition: map_vector.hpp:212
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
convert a type into constant type
Definition: aggregate.hpp:302
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:221
Transform the boost::fusion::vector into memory specification (memory_traits)
Definition: memory_conf.hpp:84
inter_memc< typename T::type >::type type
for each element in the vector interleave memory_c
Definition: memory_conf.hpp:86
Transform the boost::fusion::vector into memory specification (memory_traits)
copy for a source object to a destination
Definition: meta_copy.hpp:85
This class copy general objects.
Definition: meta_copy.hpp:53
__device__ __host__ void operator()(T &t) const
It call the copy function for each property.
sg_type & sg
encapsulated source object
Functor switch to select the vector sparse for standars scalar and blocked implementation.
static void solveConflicts(vector_index_type &vct_index, vector_index_type &vct_index_tmp, vector_index_type &vct_index_tmp2, vector_index_type &vct_index_tmp3, vector_index_dtmp_type &vct_index_dtmp, vector_index_type &vct_add_index_cont_1, vector_index_type2 &vct_add_index_unique, vector_data_type &vct_data, vector_data_type &vct_add_data, vector_data_type &vct_add_data_unique, vector_data_type &vct_add_data_cont, ite_gpu< 1 > &itew, block_functor &blf, gpu::ofp_context_t &gpuContext)
Merge all datas.
this class is a functor for "for_each" algorithm
vector_data_type & vector_data_red
Vector in which to the reduction.
void operator()(T &t) const
It call the copy function for each property.
vector_index_type_reo & reorder_add_index_cpu
reorder vector index
vector_index_type & vector_index
Index type vector.
vector_data_type & vector_data
Vector in which to the reduction.
sparse_vector_reduction_cpu(vector_data_type &vector_data_red, vector_data_type &vector_data, vector_index_type &vector_index, vector_index_type_reo &reorder_add_index_cpu)
constructor
this class is a functor for "for_each" algorithm
void operator()(T &t) const
It call the copy function for each property.
sparse_vector_reduction_solve_conflict_assign_cpu(encap_src &src, encap_dst &dst)
constructor
this class is a functor for "for_each" algorithm
void operator()(T &t) const
It call the copy function for each property.
sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src &src, encap_dst &dst)
constructor
this class is a functor for "for_each" algorithm
block_functor & blf
block functor
vector_data_type & vector_data_unsorted
new data in an unsorted way
sparse_vector_reduction(vector_data_type &vector_data_red, vector_data_type &vector_data, vector_data_type &vector_data_unsorted, vector_index_type &vector_data_map, vector_index_type2 &segment_offset, block_functor &blf, gpu::ofp_context_t &gpuContext)
constructor
vector_data_type & vector_data
new datas
vector_data_type & vector_data_red
Vector in which to the reduction.
vector_index_type & vector_data_map
map of the data
vector_index_type2 & segment_offset
segment of offsets
gpu::ofp_context_t & gpuContext
gpu context
void operator()(T &t) const
It call the copy function for each property.
this class is a functor for "for_each" algorithm
vector_data_type & vector_data
Vector in which to the reduction.
gpu::ofp_context_t & gpuContext
gpu context
void operator()(T &t) const
It call the copy function for each property.
sparse_vector_special(vector_data_type &vector_data_red, vector_data_type &vector_data, vector_index_type &segment_offset, gpu::ofp_context_t &gpuContext)
constructor
vector_index_type & segment_offset
segment of offsets
vector_data_type & vector_data_red
Vector in which to the reduction.
temporal buffer for reductions