OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
map_vector_sparse.hpp
1 /*
2  * map_vector_sparse.hpp
3  *
4  * Created on: Jan 22, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef MAP_VECTOR_SPARSE_HPP_
9 #define MAP_VECTOR_SPARSE_HPP_
10 
11 #include "util/cuda_launch.hpp"
12 #include "Vector/map_vector.hpp"
13 #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
14 #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
15 #include "util/cuda/ofp_context.hxx"
16 #include <iostream>
17 #include <limits>
18 
19 #if defined(__NVCC__)
20  #if !defined(CUDA_ON_CPU) && !defined(__HIP__)
21  #include "util/cuda/moderngpu/kernel_segreduce.hxx"
22  #include "util/cuda/moderngpu/kernel_merge.hxx"
23  #endif
24  #include "util/cuda/kernels.cuh"
25 #endif
26 
27 #include "util/cuda/scan_ofp.cuh"
28 #include "util/cuda/sort_ofp.cuh"
29 #include "util/cuda/segreduce_ofp.cuh"
30 #include "util/cuda/merge_ofp.cuh"
31 
32 enum flush_type
33 {
34  FLUSH_ON_HOST = 0,
35  FLUSH_ON_DEVICE = 1,
36  FLUSH_NO_DATA = 2
37 };
38 
39 template<typename OfpmVectorT>
40 using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;
41 
42 namespace openfpm
43 {
44  // All props
45  template<typename sg_type>
46  struct htoD
47  {
49  sg_type & sg;
50 
51  unsigned int lele;
52 
53  htoD(sg_type & sg, unsigned int lele)
54  :sg(sg),lele(lele)
55  {};
56 
57 
58  //! It calls the copy function for each property
59  template<typename T>
60  __device__ __host__ inline void operator()(T& t) const
61  {
62  sg.template hostToDevice<T::value>(lele,lele);
63  }
64  };
65 
66  constexpr int VECTOR_SPARSE_STANDARD = 1;
67  constexpr int VECTOR_SPARSE_BLOCK = 2;
68 
69  template<typename reduction_type, unsigned int impl>
70  struct cpu_block_process
71  {
72  template<typename encap_src, typename encap_dst>
73  static inline void process(encap_src & src, encap_dst & dst)
74  {
75  dst = reduction_type::red(dst,src);
76  }
77  };
78 
79  template<typename reduction_type>
80  struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK>
81  {
82  template<typename encap_src, typename encap_dst>
83  static inline void process(encap_src & src, encap_dst & dst)
84  {
85  for (size_t i = 0 ; i < encap_src::size ; i++)
86  {
87  dst[i] = reduction_type::red(dst[i],src[i]);
88  }
89  }
90  };
91 
92  template<typename reduction_type>
93  struct cpu_block_process<reduction_type,3>
94  {
95  template<typename encap_src, typename encap_dst,unsigned int N1>
96  static inline void process(encap_src & src, encap_dst (& dst)[N1])
97  {
98  for (unsigned int j = 0 ; j < N1 ; j++)
99  {
100  for (size_t i = 0 ; i < encap_dst::size ; i++)
101  {
102  dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
103  }
104  }
105  }
106 
107  template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst>
108  static inline void process_e(encap_src & src, encap_dst & dst)
109  {
110  for (unsigned int j = 0 ; j < N1 ; j++)
111  {
112  for (size_t i = 0 ; i < blockSize ; i++)
113  {
114  dst[i][j] = reduction_type::red(dst[i][j],src[i][j]);
115  }
116  }
117  }
118  };
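 /* A reduction functor for cpu_block_process only has to expose a static
  * red(a,b). A minimal sketch of a compatible scalar functor (the name
  * radd_sketch is hypothetical, not part of this header):
  *
  * \code
  * struct radd_sketch
  * {
  *     template<typename T1, typename T2>
  *     static T1 red(T1 & a, T2 & b) {return a + b;}
  * };
  *
  * // cpu_block_process<radd_sketch,VECTOR_SPARSE_STANDARD>::process(src,dst)
  * // then performs dst = dst + src on a single scalar property
  * \endcode
  */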
119 
120  /*! \brief Functor switch to select the vector sparse for standard scalar and blocked implementation
121  *
122  */
123 
124  template<unsigned int impl, typename block_functor>
125  struct scalar_block_implementation_switch // Case for scalar implementations
126  {
127  template <unsigned int p, typename vector_index_type>
128  static void extendSegments(vector_index_type & segments, size_t dataSize)
129  {
130 #ifdef __NVCC__
131  // Pass as there is nothing to append for mgpu
132 #else // __NVCC__
133  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
134 #endif // __NVCC__
135  }
136 
137  template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2>
138  static void segreduce(vector_data_type & vector_data,
139  vector_data_type & vector_data_unsorted,
140  vector_index_type & vector_data_map,
141  vector_index_type2 & segment_offset,
142  vector_data_type & vector_data_red,
143  block_functor & blf,
144  mgpu::ofp_context_t & context)
145  {
146 #ifdef __NVCC__
147  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
148  typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
149  typedef typename reduction_type::template op_red<red_type> red_op;
150  typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
151  red_type init;
152  init = 0;
153 
154  assert((std::is_same<seg_type,int>::value == true));
155 
156  openfpm::segreduce(
157  (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
158  (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(),
159  (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
160  red_op(), init, context);
161 #else // __NVCC__
162  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
163 #endif // __NVCC__
164  }
165 
166 
182  template <
183  typename vector_data_type,
184  typename vector_index_type,
185  typename vector_index_type2,
186  typename vector_index_dtmp_type,
187  typename Ti,
188  typename ... v_reduce>
189  static void solveConflicts(
190  vector_index_type & vct_index_old,
191  vector_index_type & vct_index_merge,
192  vector_index_type & vct_index_merge_id,
193  vector_index_type & vct_index_out,
194  vector_index_dtmp_type & vct_index_dtmp,
195  vector_index_type & data_map,
196  vector_index_type2 & segments_new,
197  vector_data_type & vct_data_old,
198  vector_data_type & vct_add_data,
199  vector_data_type & vct_add_data_unique,
200  vector_data_type & vct_data_out,
201  ite_gpu<1> & itew,
202  block_functor & blf,
203  mgpu::ofp_context_t & context
204  )
205  {
206 #ifdef __NVCC__
207 
208  CUDA_LAUNCH((solve_conflicts<
209  decltype(vct_index_merge.toKernel()),
210  decltype(vct_data_old.toKernel()),
211  decltype(vct_index_dtmp.toKernel()),
212  128,
213  v_reduce ...
214  >),
215  itew,
216  vct_index_merge.toKernel(),vct_data_old.toKernel(),
217  vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(),
218  vct_index_out.toKernel(),vct_data_out.toKernel(),
219  vct_index_dtmp.toKernel(),
220  vct_index_old.size());
221 
222  // we scan tmp3
223  openfpm::scan(
224  (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
225  vct_index_dtmp.size(),
226  (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
227  context);
228 
229  // get the size to resize vct_index and vct_data
230  vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
231  int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
232 
233  vct_index_old.resize(size);
234  vct_data_old.resize(size);
235 
236  CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(),
237  vct_index_old.toKernel(), vct_data_old.toKernel(),
238  vct_index_dtmp.toKernel());
239 
240 
241 #else // __NVCC__
242  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
243 #endif // __NVCC__
244  }
245  };
246 
247 
248  template<typename block_functor>
249  struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
250  {
251  template <unsigned int p, typename vector_index_type>
252  static void extendSegments(vector_index_type & segments, size_t dataSize)
253  {
254 #ifdef __NVCC__
255  // Append trailing element to segment (marks end of last segment)
256  segments.resize(segments.size()+1);
257  segments.template get<p>(segments.size() - 1) = dataSize;
258  segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
259 #else // __NVCC__
260  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
261 #endif // __NVCC__
262  }
263 
264  template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2>
265  static void segreduce(vector_data_type & vector_data,
266  vector_data_type & vector_data_unsorted,
267  vector_index_type & vector_data_map,
268  vector_index_type2 & segment_offset,
269  vector_data_type & vector_data_red,
270  block_functor & blf,
271  mgpu::ofp_context_t & context)
272  {
273 
274  }
275 
276  template <
277  typename vector_data_type,
278  typename vector_index_type,
279  typename vector_index_type2,
280  typename vector_index_dtmp_type,
281  typename Ti,
282  typename ... v_reduce>
283  static void solveConflicts(
284  vector_index_type & vct_index_old,
285  vector_index_type & vct_index_merge,
286  vector_index_type & vct_index_merge_id,
287  vector_index_type & vct_index_out,
288  vector_index_dtmp_type & vct_index_dtmp,
289  vector_index_type & data_map,
290  vector_index_type2 & segments_new,
291  vector_data_type & vct_data,
292  vector_data_type & vct_add_data,
293  vector_data_type & vct_add_data_unique,
294  vector_data_type & vct_data_out,
295  ite_gpu<1> & itew,
296  block_functor & blf,
297  mgpu::ofp_context_t & context
298  )
299  {
300 #ifdef __NVCC__
301  blf.template solve_conflicts<1,
302  decltype(vct_index_merge),
303  decltype(segments_new),
304  decltype(vct_data),
305  v_reduce ...>
306  (vct_index_merge, vct_index_merge_id, segments_new, data_map,
307  vct_data, vct_add_data,
308  vct_index_old, vct_data_out,
309  context);
310  vct_data_out.swap(vct_data);
311 
312 #else // __NVCC__
313  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
314 #endif // __NVCC__
315  }
316  };
317 
318  template<typename Ti>
319  struct reorder
320  {
321  Ti id;
322  Ti id2;
323 
324  bool operator<(const reorder & t) const
325  {
326  return id < t.id;
327  }
328  };
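 /* Sorting a vector of reorder<Ti> by id makes equal indexes adjacent, while
  * id2 remembers the original position, giving the permutation used by
  * flush_on_cpu further below. A small worked example:
  *
  * \code
  * // before sort: {id=7,id2=0} {id=3,id2=1} {id=7,id2=2}
  * // after sort:  {id=3,id2=1} {id=7,id2=0} {id=7,id2=2}
  * // duplicates of id=7 are now contiguous and can be reduced in one pass
  * \endcode
  */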
329 
330  template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type>
331  struct sparse_vector_reduction_cpu_impl
332  {
333  template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
334  static inline void red(size_t & i, vector_data_type & vector_data_red,
335  vector_data_type & vector_data,
336  vector_index_type & vector_index,
337  vector_index_type_reo & reorder_add_index_cpu)
338  {
339  size_t start = reorder_add_index_cpu.get(i).id;
340  red_type red = vector_data.template get<reduction_type::prop::value>(i);
341 
342  size_t j = 1;
343  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
344  {
345  cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
346  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
347  }
348  vector_data_red.add();
349  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
350 
351  if (T::value == 0)
352  {
353  vector_index.add();
354  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
355  }
356 
357  i += j;
358  }
359  };
360 
361 
362  template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1>
363  struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]>
364  {
365  template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
366  static inline void red(size_t & i, vector_data_type & vector_data_red,
367  vector_data_type & vector_data,
368  vector_index_type & vector_index,
369  vector_index_type_reo & reorder_add_index_cpu)
370  {
371  size_t start = reorder_add_index_cpu.get(i).id;
372  red_type red[N1];
373 
374  for (size_t k = 0 ; k < N1 ; k++)
375  {
376  red[k] = vector_data.template get<reduction_type::prop::value>(i)[k];
377  }
378 
379  size_t j = 1;
380  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
381  {
382  auto ev = vector_data.template get<reduction_type::prop::value>(i+j);
383  cpu_block_process<reduction_type,impl+1>::process(ev,red);
384  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
385  }
386 
387  vector_data_red.add();
388 
389  for (size_t k = 0 ; k < N1 ; k++)
390  {
391  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k];
392  }
393 
394  if (T::value == 0)
395  {
396  vector_index.add();
397  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
398  }
399 
400  i += j;
401  }
402  };
403 
414  template<typename vector_data_type,
415  typename vector_index_type,
416  typename vector_index_type_reo,
417  typename vector_reduction,
418  unsigned int impl>
419  struct sparse_vector_reduction_cpu
420  {
422  vector_data_type & vector_data_red;
423 
425  vector_data_type & vector_data;
426 
428  vector_index_type_reo & reorder_add_index_cpu;
429 
431  vector_index_type & vector_index;
432 
439  inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red,
440  vector_data_type & vector_data,
441  vector_index_type & vector_index,
442  vector_index_type_reo & reorder_add_index_cpu)
443  :vector_data_red(vector_data_red),vector_data(vector_data),vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu)
444  {};
445 
447  template<typename T>
448  inline void operator()(T& t) const
449  {
450  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
451  typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
452 
453  if (reduction_type::is_special() == false)
454  {
455  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; )
456  {
457  sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu);
458 
459 /* size_t start = reorder_add_index_cpu.get(i).id;
460  red_type red = vector_data.template get<reduction_type::prop::value>(i);
461 
462  size_t j = 1;
463  for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
464  {
465  cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
466  //reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
467  }
468  vector_data_red.add();
469  vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;
470 
471  if (T::value == 0)
472  {
473  vector_index.add();
474  vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
475  }
476 
477  i += j;*/
478  }
479  }
480  }
481  };
482 
493  template<typename encap_src,
494  typename encap_dst,
495  typename vector_reduction>
496  struct sparse_vector_reduction_solve_conflict_assign_cpu
497  {
499  encap_src & src;
500 
502  encap_dst & dst;
503 
504 
511  inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst)
512  :src(src),dst(dst)
513  {};
514 
516  template<typename T>
517  inline void operator()(T& t) const
518  {
519  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
520 
521  dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>();
522  }
523  };
524 
525 
526  template<unsigned int impl,typename vector_reduction, typename T,typename red_type>
527  struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl
528  {
529  template<typename encap_src, typename encap_dst>
530  static inline void red(encap_src & src, encap_dst & dst)
531  {
532  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
533 
534  cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
535  }
536  };
537 
538  template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1>
539  struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]>
540  {
541  template<typename encap_src, typename encap_dst>
542  static inline void red(encap_src & src, encap_dst & dst)
543  {
544  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
545 
546  auto src_e = src.template get<reduction_type::prop::value>();
547  auto dst_e = dst.template get<reduction_type::prop::value>();
548 
549  cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e);
550  }
551  };
552 
563  template<typename encap_src,
564  typename encap_dst,
565  typename vector_reduction,
566  unsigned int impl>
567  struct sparse_vector_reduction_solve_conflict_reduce_cpu
568  {
570  encap_src & src;
571 
573  encap_dst & dst;
574 
575 
582  inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst)
583  :src(src),dst(dst)
584  {};
585 
587  template<typename T>
588  inline void operator()(T& t) const
589  {
590  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
591  typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type;
592 
593  sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst);
594 
595 // cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
596 // reduction_type::red(dst.template get<reduction_type::prop::value>(),src.template get<reduction_type::prop::value>());
597  }
598  };
599 
610  template<typename vector_data_type,
611  typename vector_index_type,
612  typename vector_index_type2,
613  typename vector_reduction,
614  typename block_functor,
615  unsigned int impl2, unsigned int pSegment=1>
616  struct sparse_vector_reduction
617  {
619  vector_data_type & vector_data_red;
620 
622  vector_data_type & vector_data;
623 
625  vector_data_type & vector_data_unsorted;
626 
628  vector_index_type2 & segment_offset;
629 
631  vector_index_type & vector_data_map;
632 
634  block_functor & blf;
635 
637  mgpu::ofp_context_t & context;
638 
645  inline sparse_vector_reduction(vector_data_type & vector_data_red,
646  vector_data_type & vector_data,
647  vector_data_type & vector_data_unsorted,
648  vector_index_type & vector_data_map,
649  vector_index_type2 & segment_offset,
650  block_functor & blf,
651  mgpu::ofp_context_t & context)
652  :vector_data_red(vector_data_red),
653  vector_data(vector_data),
654  vector_data_unsorted(vector_data_unsorted),
655  vector_data_map(vector_data_map),
656  segment_offset(segment_offset),
657  blf(blf),
658  context(context)
659  {};
660 
662  template<typename T>
663  inline void operator()(T& t) const
664  {
665 #ifdef __NVCC__
666 
667  typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
668  typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
669  if (reduction_type::is_special() == false)
670  {
671  scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>(
672  vector_data,
673  vector_data_unsorted,
674  vector_data_map,
675  segment_offset,
676  vector_data_red,
677  blf,
678  context);
679  }
680 #else
681  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
682 #endif
683  }
684  };
685 
686 
687  struct stub_block_functor
688  {
689  template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type>
690  static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst)
691  {
692  return true;
693  }
694 
695  template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
696  static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
697  vector_data_type &data1, vector_data_type &data2,
698  vector_index_type &indices_tmp, vector_data_type &data_tmp,
699  vector_index_type &keysOut, vector_data_type &dataOut,
700  mgpu::ofp_context_t & context)
701  {
702  return true;
703  }
704 
705  mutable openfpm::vector_gpu<aggregate<unsigned int>> outputMap;
706 
707  openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
708  {
709  return outputMap;
710  }
711 
712  const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
713  {
714  return outputMap;
715  }
716  };
717 
728  template<typename vector_data_type, typename vector_index_type, typename vector_reduction>
729  struct sparse_vector_special
730  {
732  vector_data_type & vector_data_red;
733 
735  vector_data_type & vector_data;
736 
738  vector_index_type & segment_offset;
739 
741  mgpu::ofp_context_t & context;
742 
749  inline sparse_vector_special(vector_data_type & vector_data_red,
750  vector_data_type & vector_data,
751  vector_index_type & segment_offset,
752  mgpu::ofp_context_t & context)
753  :vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context)
754  {};
755 
757  template<typename T>
758  inline void operator()(T& t) const
759  {
760 #ifdef __NVCC__
761 
762  typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type;
763 
764  // reduction type
765  typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
766 
767  if (reduction_type::is_special() == true)
768  {
769  auto ite = segment_offset.getGPUIterator();
770 
771  CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>),
772  ite,segment_offset.toKernel(),vector_data_red.toKernel(),vector_data.size());
773  }
774 
775 #else
776  std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
777 #endif
778  }
779  };
780 
781  template<typename T,
782  typename Ti = long int,
783  typename Memory=HeapMemory,
784  typename layout=typename memory_traits_lin<T>::type,
785  template<typename> class layout_base=memory_traits_lin ,
786  typename grow_p=grow_policy_double,
787  unsigned int impl=vect_isel<T>::value,
788  unsigned int impl2 = VECTOR_SPARSE_STANDARD,
789  typename block_functor = stub_block_functor>
790  class vector_sparse
791  {
792  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
793  vector<T,Memory,layout_base,grow_p,impl> vct_data;
794  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;
795 
796  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
797  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
798  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
799  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
800  vector<T,Memory,layout_base,grow_p> vct_add_data;
801  vector<T,Memory,layout_base,grow_p> vct_add_data_reord;
802 
803  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0;
804  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1;
805  vector<T,Memory,layout_base,grow_p> vct_add_data_cont;
806  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique;
807  vector<aggregate<int,int>,Memory,layout_base,grow_p> segments_int;
808 
809  vector<T,Memory,layout_base,grow_p,impl> vct_add_data_unique;
810 
811  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4;
812  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
813  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2;
814  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3;
815  vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp;
816 
817  // segments map (This is used only in case of Blocked data)
818  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_segment_index_map;
819 
820  block_functor blf;
821 
822  T bck;
823 
824  CudaMemory mem;
825 
826  openfpm::vector<reorder<Ti>> reorder_add_index_cpu;
827 
828  size_t max_ele;
829 
830  int n_gpu_add_block_slot = 0;
831  int n_gpu_rem_block_slot = 0;
832 
839  template<bool prefetch>
840  inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
841  {
842  if (vct_index.size() == 0) {id = 0; return -1;}
843  const Ti *base = &vct_index.template get<0>(0);
844  const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
845  Ti n = vct_data.size()-1;
846  while (n > 1)
847  {
848  Ti half = n / 2;
849  if (prefetch)
850  {
851  __builtin_prefetch(base + half/2, 0, 0);
852  __builtin_prefetch(base + half + half/2, 0, 0);
853  }
854  base = (base[half] < x) ? base+half : base;
855  n -= half;
856  }
857 
858  int off = (*base < x);
859  id = base - &vct_index.template get<0>(0) + off;
860  return (base + off != end)?*(base + off):-1;
861  }
862 
869  template<bool prefetch>
870  inline void _branchfree_search(Ti x, Ti & id) const
871  {
872  Ti v = _branchfree_search_nobck<prefetch>(x,id);
873  id = (x == v)?id:vct_data.size()-1;
874  }
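 /* The two routines above implement a branchless binary search: every
  * iteration halves the range by conditionally moving the base pointer
  * instead of taking a branch. A self-contained sketch of the same idea on
  * a plain sorted array (simplified, without the background handling):
  *
  * \code
  * template<typename Ti>
  * Ti lower_bound_branchfree(const Ti * arr, Ti n, Ti x)
  * {
  *     const Ti * base = arr;
  *     while (n > 1)
  *     {
  *         Ti half = n / 2;
  *         base = (base[half] < x) ? base + half : base; // cmov, no branch
  *         n -= half;
  *     }
  *     return (Ti)(base - arr) + (*base < x); // index of first element >= x
  * }
  * \endcode
  */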
875 
876 
877  /*! \brief take the indexes for the insertion pools and create a continuous array
878  *
879  * \param vct_nadd_index number of insertions of each pool
880  * \param vct_add_index pool of inserts
881  * \param vct_add_cont_index output continuous array of inserted indexes
882  * \param vct_add_data array of added data
883  * \param vct_add_data_cont continuous array of inserted data
884  * \param context mgpu context
885  *
886  */
887  size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
888  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index,
889  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
890  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
891  vector<T,Memory,layout_base,grow_p> & vct_add_data,
892  vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
893  mgpu::ofp_context_t & context)
894  {
895 #ifdef __NVCC__
896 
897  // Append a zero element at the end of vct_nadd_index
898  vct_nadd_index.resize(vct_nadd_index.size()+1);
899  vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
900  vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);
901 
902  // Merge the list of inserted points for each block
903  vct_index_tmp4.resize(vct_nadd_index.size());
904 
905  openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
906  vct_nadd_index.size(),
907  (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() ,
908  context);
909 
910  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
911  size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
912 
913  // we reuse vct_nadd_index
914  vct_add_cont_index.resize(n_ele);
915  vct_add_cont_index_map.resize(n_ele);
916 
917  if (impl2 == VECTOR_SPARSE_STANDARD)
918  {
919  vct_add_data_cont.resize(n_ele);
920  }
921  else
922  {
923  vct_segment_index_map.resize(n_ele);
924  }
925 
926  if (n_gpu_add_block_slot >= 128)
927  {
928  ite_gpu<1> itew;
929  itew.wthr.x = vct_nadd_index.size()-1;
930  itew.wthr.y = 1;
931  itew.wthr.z = 1;
932  itew.thr.x = 128;
933  itew.thr.y = 1;
934  itew.thr.z = 1;
935 
936  CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
937  vct_nadd_index.toKernel(),
938  vct_index_tmp4.toKernel(),
939  vct_add_cont_index.toKernel(),
940  vct_add_cont_index_map.toKernel(),
941  n_gpu_add_block_slot);
942  }
943  else
944  {
945  auto itew = vct_add_index.getGPUIterator();
946 
947  CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
948  vct_nadd_index.toKernel(),
949  vct_index_tmp4.toKernel(),
950  vct_add_cont_index.toKernel(),
951  vct_add_cont_index_map.toKernel(),
952  n_gpu_add_block_slot);
953  }
954 
955  return n_ele;
956 #endif
957  return 0;
958  }
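 /* The launches above are a scan-based compaction: an exclusive scan of the
  * per-pool insert counters gives every pool its base offset in the
  * continuous output array. Host sketch of the same offset computation
  * (assumes a plain std::vector, not the OpenFPM kernels):
  *
  * \code
  * std::vector<int> pool_offsets(const std::vector<int> & n_per_pool)
  * {
  *     std::vector<int> off(n_per_pool.size()+1,0);
  *     for (size_t i = 0 ; i < n_per_pool.size() ; i++)
  *     {off[i+1] = off[i] + n_per_pool[i];} // exclusive scan
  *     return off; // pool b writes its inserts at off[b] ... off[b+1]-1
  * }
  * \endcode
  */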
959 
969  void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
970  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
971  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
972  vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
973  mgpu::ofp_context_t & context)
974  {
975 #ifdef __NVCC__
976  ite_gpu<1> itew;
977  itew.wthr.x = vct_nadd_index.size()-1;
978  itew.wthr.y = 1;
979  itew.wthr.z = 1;
980  itew.thr.x = 128;
981  itew.thr.y = 1;
982  itew.thr.z = 1;
983 
984  size_t n_ele = vct_add_cont_index.size();
985 
986  n_gpu_add_block_slot = 0;
987 
988  // now we sort
989  openfpm::sort(
990  (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(),
991  (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(),
992  vct_add_cont_index.size(),
993  mgpu::template less_t<Ti>(),
994  context);
995 
996  auto ite = vct_add_cont_index.getGPUIterator();
997 
998  // Now we reorder the data vector accordingly to the indexes
999 
1000  if (impl2 == VECTOR_SPARSE_STANDARD)
1001  {
1002  vct_add_data_reord.resize(n_ele);
1003  CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel());
1004  }
1005 
1006 #endif
1007  }
1008 
1015  template<typename ... v_reduce>
1016  void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort,
1017  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
1018  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index,
1019  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map,
1020  mgpu::ofp_context_t & context)
1021  {
1022 #ifdef __NVCC__
1023 
1024  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1025 
1026  auto ite = vct_add_index_sort.getGPUIterator();
1027 
1028  mem.allocate(sizeof(int));
1029  mem.fill(0);
1030  vct_add_index_unique.resize(vct_add_index_sort.size()+1);
1031 
1032  ite = vct_add_index_sort.getGPUIterator();
1033 
1034  vct_index_tmp4.resize(vct_add_index_sort.size()+1);
1035 
1036  CUDA_LAUNCH(
1037  (
1038  find_buffer_offsets_for_scan
1039  <0,
1040  decltype(vct_add_index_sort.toKernel()),
1041  decltype(vct_index_tmp4.toKernel())
1042  >
1043  ),
1044  ite,
1045  vct_add_index_sort.toKernel(),
1046  vct_index_tmp4.toKernel());
1047 
1048  openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context);
1049 
1050  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1051  int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1052 
1053  vct_add_index_unique.resize(n_ele_unique);
1054 
1055  if (impl2 == VECTOR_SPARSE_STANDARD)
1056  {
1057  vct_add_data_unique.resize(n_ele_unique);
1058  }
1059 
1060  CUDA_LAUNCH(
1061  (construct_index_unique<0>),
1062  ite,
1063  vct_add_index_sort.toKernel(),
1064  vct_index_tmp4.toKernel(),
1065  vct_add_index_unique.toKernel());
1066 
1067  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1068 
1069  // Then we merge the two list vct_index and vct_add_index_unique
1070 
1071  // index to get merge index
1072  vct_m_index.resize(vct_index.size());
1073 
1074  if (vct_m_index.size() != 0)
1075  {
1076  ite = vct_m_index.getGPUIterator();
1077  CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1078  }
1079 
1080  // after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge
1081  // we reorder the data array also
1082 
1083  vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size());
1084  vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size());
1085  vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size());
1086 
1087  // Do not delete this reserve
1088  // Unfortunately all resize with DataBlocks are broken
1089  if (impl2 == VECTOR_SPARSE_STANDARD)
1090  {
1091  vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
1092  vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
1093  }
1094 
1095  ite = vct_add_index_unique.getGPUIterator();
1096  vct_index_tmp4.resize(vct_add_index_unique.size());
1097  CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size());
1098 
1099  ite_gpu<1> itew;
1100 
1101  itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0);
1102  itew.wthr.y = 1;
1103  itew.wthr.z = 1;
1104  itew.thr.x = 128;
1105  itew.thr.y = 1;
1106  itew.thr.z = 1;
1107 
1108  vct_index_dtmp.resize(itew.wthr.x);
1109 
1110  // we merge vct_index with vct_add_index_unique into vct_merge_index; vct_merge_index contains the merged indexes
1111  //
1112 
1113  openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1114  (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
1115  (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
1116 
1117 
1118 #endif
1119  }
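 /* The unique-index construction above follows the usual flag + scan
  * pattern: mark every position where the sorted key differs from its
  * predecessor, scan the marks, and write each first occurrence at its
  * scanned offset. Worked example:
  *
  * \code
  * // sorted keys : 3 3 5 5 5 9
  * // flags       : 1 0 1 0 0 1   (1 where a new key starts)
  * // excl. scan  : 0 1 1 2 2 2
  * // unique out  : 3 5 9          (key written at scan[i] when flags[i]==1)
  * \endcode
  */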
1120 
1121 
1122 
1123  template<typename ... v_reduce>
1124  void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1125  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
1126  vector<T,Memory,layout_base,grow_p> & vct_add_data,
1127  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
1128  mgpu::ofp_context_t & context)
1129  {
1130 #ifdef __NVCC__
1131  ite_gpu<1> itew;
1132  itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1133  itew.wthr.y = 1;
1134  itew.wthr.z = 1;
1135  itew.thr.x = 128;
1136  itew.thr.y = 1;
1137  itew.thr.z = 1;
1138 
1139  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1140 
1142 
1143  // Now we can do a segmented reduction
1144  scalar_block_implementation_switch<impl2, block_functor>
1145  ::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());
1146 
1147  if (impl2 == VECTOR_SPARSE_STANDARD)
1148  {
1149  sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
1150  decltype(vct_add_data_reord_map),
1151  decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
1152  svr(
1153  vct_add_data_unique,
1154  vct_add_data_reord,
1155  vct_add_data,
1156  vct_add_data_reord_map,
1157  vct_add_index_unique,
1158  blf,
1159  context);
1160 
1161  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1162  }
1163 
1164  sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
1165  decltype(vct_add_index_unique),
1166  vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context);
1167  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);
1168 
1170 
1171  // Now perform the right solve_conflicts according to impl2
1172  scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
1173  decltype(vct_data),
1174  decltype(vct_index),
1175  decltype(segments_new),
1176  decltype(vct_index_dtmp),
1177  Ti,
1178  v_reduce ...
1179  >
1180  (
1181  vct_index,
1182  vct_index_tmp,
1183  vct_index_tmp2,
1184  vct_index_tmp3,
1185  vct_index_dtmp,
1186  vct_add_data_reord_map,
1187  segments_new,
1188  vct_data,
1189  vct_add_data,
1190  vct_add_data_unique,
1191  vct_add_data_cont,
1192  itew,
1193  blf,
1194  context
1195  );
1196 
1197 
1198 #else
1199  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1200 #endif
1201  }
1202 
1203  template<typename ... v_reduce>
1204  void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1205  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1206  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1207  mgpu::ofp_context_t & context)
1208  {
1209 #ifdef __NVCC__
1210 
1211  // To avoid the case where you never called setGPUInsertBuffer
1212  if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
1213  {
1214  return;
1215  }
1216 
1217  size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1,
1218  vct_add_data,vct_add_data_cont,context);
1219 
1220  // At this point we can check whether we have not inserted anything actually,
1221  // in this case, return without further ado...
1222  if (vct_add_index_cont_0.size() == 0)
1223  {return;}
1224 
1225  reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context);
1226 
1227  merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
1228  vct_index_tmp,vct_index_tmp2,
1229  context);
1230 
1231  merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context);
1232 
1233 #else
1234  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1235 #endif
1236  }
1237 
1238 
1239  void flush_on_gpu_remove(
1240  mgpu::ofp_context_t & context)
1241  {
1242 #ifdef __NVCC__
1243 
1244  // Append a zero element at the end of vct_nrem_index
1245  vct_nrem_index.resize(vct_nrem_index.size()+1);
1246  vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
1247  vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);
1248 
1249  // Merge the list of inserted points for each block
1250  vct_index_tmp4.resize(vct_nrem_index.size());
1251 
1252  openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , context);
1253 
1254  vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
1255  size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
1256 
1257  // we reuse vct_nadd_index
1258  vct_add_index_cont_0.resize(n_ele);
1259  vct_add_index_cont_1.resize(n_ele);
1260 
1261  ite_gpu<1> itew;
1262  itew.wthr.x = vct_nrem_index.size()-1;
1263  itew.wthr.y = 1;
1264  itew.wthr.z = 1;
1265  itew.thr.x = 128;
1266  itew.thr.y = 1;
1267  itew.thr.z = 1;
1268 
1269  CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
1270  vct_nrem_index.toKernel(),
1271  vct_index_tmp4.toKernel(),
1272  vct_add_index_cont_0.toKernel(),
1273  vct_add_index_cont_1.toKernel(),
1274  n_gpu_rem_block_slot);
1275 
1276  // now we sort
1277  openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
1278  vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context);
1279 
1280  auto ite = vct_add_index_cont_0.getGPUIterator();
1281 
1282  mem.allocate(sizeof(int));
1283  mem.fill(0);
1284  vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);
1285 
1286  ite = vct_add_index_cont_0.getGPUIterator();
1287 
1288  // produce unique index list
1289  // Find the buffer bases
1290  CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>),
1291  ite,
1292  vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());
1293 
1294  mem.deviceToHost();
1295  int n_ele_unique = *(int *)mem.getPointer();
1296 
1297  vct_add_index_unique.resize(n_ele_unique);
1298 
1299  openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
1300  vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context);
1301 
1302  // Then we merge the two list vct_index and vct_add_index_unique
1303 
1304  // index to get merge index
1305  vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());
1306 
1307  ite = vct_m_index.getGPUIterator();
1308  CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
1309 
1310  ite = vct_add_index_unique.getGPUIterator();
1311  CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size());
1312 
1313  // after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge
1314  // we reorder the data array also
1315 
1316  vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1317  vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());
1318 
1319  itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
1320  itew.wthr.y = 1;
1321  itew.wthr.z = 1;
1322  itew.thr.x = 128;
1323  itew.thr.y = 1;
1324  itew.thr.z = 1;
1325 
1326  vct_index_dtmp.resize(itew.wthr.x);
1327 
1328  // we merge vct_index with vct_add_index_unique into vct_index_tmp; vct_index_tmp2 contains the merging index
1329  //
1330  openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
1331  (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
1332  (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
1333 
1334  vct_index_tmp3.resize(128*itew.wthr.x);
1335 
1336  CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>),
1337  itew,
1338  vct_index_tmp.toKernel(),
1339  vct_index_tmp2.toKernel(),
1340  vct_index_tmp3.toKernel(),
1341  vct_m_index.toKernel(),
1342  vct_index_dtmp.toKernel(),
1343  vct_index.size());
1344 
1345  // we scan tmp3
1346  openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context);
1347 
1348  // get the size to resize vct_index and vct_data
1349  vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
1350  int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);
1351 
1352  vct_add_data_cont.resize(size);
1353  vct_index.resize(size);
1354 
1355  CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
1356  vct_index.toKernel(),vct_add_data_cont.toKernel(),
1357  vct_index_dtmp.toKernel());
1358 
1359  vct_data.swap(vct_add_data_cont);
1360 
1361 #else
1362  std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
1363 #endif
1364  }
1365 
1366  void resetBck()
1367  {
1368  // re-add background
1369  vct_data.resize(vct_data.size()+1);
1370  vct_data.get(vct_data.size()-1) = bck;
1371 
1372  htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1373  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1374  }
1375 
1376  template<typename ... v_reduce>
1377  void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1378  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
1379  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1380  mgpu::ofp_context_t & context)
1381  {
1382  flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
1383  }
1384 
1385  template<typename ... v_reduce>
1386  void flush_on_cpu()
1387  {
1388  if (vct_add_index.size() == 0)
1389  {return;}
1390 
1391  // First copy the added index to reorder
1392  reorder_add_index_cpu.resize(vct_add_index.size());
1393  vct_add_data_cont.resize(vct_add_index.size());
1394 
1395  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1396  {
1397  reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
1398  reorder_add_index_cpu.get(i).id2 = i;
1399  }
1400 
1401  reorder_add_index_cpu.sort();
1402 
1403  // Copy the data
1404  for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
1405  {
1406  vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
1407  }
1408 
1409  typedef boost::mpl::vector<v_reduce...> vv_reduce;
1410 
1411  sparse_vector_reduction_cpu<decltype(vct_add_data),
1412  decltype(vct_add_index_unique),
1413  decltype(reorder_add_index_cpu),
1414  vv_reduce,
1415  impl2>
1416  svr(vct_add_data_unique,
1417  vct_add_data_cont,
1418  vct_add_index_unique,
1419  reorder_add_index_cpu);
1420 
1421  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1422 
1423  // merge the data
1424 
1425  vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
1426  vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
1427 
1428  vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
1429  vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
1430 
1431  Ti di = 0;
1432  Ti ai = 0;
1433  size_t i = 0;
1434 
1435  for ( ; i < vct_data_tmp.size() ; i++)
1436  {
1437  Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
1438  Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();
1439 
1440  if ( id_a <= id_d )
1441  {
1442  vct_index_tmp.template get<0>(i) = id_a;
1443 
1444  if (id_a == id_d)
1445  {
1446  auto dst = vct_data_tmp.get(i);
1447  auto src = vct_add_data_unique.get(ai);
1448 
1449  sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
1450  decltype(vct_add_data.get(ai)),
1451  vv_reduce>
1452  sva(src,dst);
1453 
1454  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
1455  ai++;
1456 
1457  dst = vct_data_tmp.get(i);
1458  src = vct_data.get(di);
1459 
1460  sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
1461  decltype(vct_data.get(di)),
1462  vv_reduce,
1463  impl2>
1464  svr(src,dst);
1465  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
1466 
1467  di++;
1468 
1469  vct_data_tmp.resize(vct_data_tmp.size()-1);
1470  vct_index_tmp.resize(vct_index_tmp.size()-1);
1471  }
1472  else
1473  {
1474  vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
1475  vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
1476  ai++;
1477  }
1478  }
1479  else
1480  {
1481  vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
1482  vct_data_tmp.get(i) = vct_data.get(di);
1483  di++;
1484  }
1485  }
1486 
1487  vct_index.swap(vct_index_tmp);
1488  vct_data.swap(vct_data_tmp);
1489 
1490  vct_add_data.clear();
1491  vct_add_index.clear();
1492  vct_add_index_unique.clear();
1493  vct_add_data_unique.clear();
1494  }
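 /* Semantics of the host flush above with an additive reduction on
  * property 0 (e.g. the sadd_ functor declared in
  * map_vector_sparse_cuda_kernels.cuh); values in parentheses:
  *
  * \code
  * // main list    : 2(1.0) 7(2.0)
  * // add buffer   : 7(0.5) 7(0.5) 9(3.0)
  * // after flush  : 2(1.0) 7(3.0) 9(3.0)
  * // index 7 is first reduced inside the add buffer (0.5+0.5), then once
  * // more against the element already stored in the main list (+2.0)
  * \endcode
  */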
1495 
1496  public:
1497 
1498  vector_sparse()
1499  :max_ele(0)
1500  {
1501  vct_data.resize(1);
1502  }
1503 
1508  auto getIndexBuffer() -> decltype(vct_index)&
1509  {
1510  return vct_index;
1511  }
1512 
1517  auto getDataBuffer() -> decltype(vct_data)&
1518  {
1519  return vct_data;
1520  }
1521 
1526  auto getIndexBuffer() const -> const decltype(vct_index)&
1527  {
1528  return vct_index;
1529  }
1530 
1535  auto getDataBuffer() const -> const decltype(vct_data)&
1536  {
1537  return vct_data;
1538  }
1539 
1551  inline openfpm::sparse_index<Ti> get_sparse(Ti id) const
1552  {
1553  Ti di;
1554  this->_branchfree_search<false>(id,di);
1555  openfpm::sparse_index<Ti> sid;
1556  sid.id = di;
1557 
1558  return sid;
1559  }
1560 
1571  template <unsigned int p>
1572  inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
1573  {
1574  Ti di;
1575  this->_branchfree_search<false>(id,di);
1576  return vct_data.template get<p>(di);
1577  }
1578 
1589  inline auto get(Ti id) const -> decltype(vct_data.get(id))
1590  {
1591  Ti di;
1592  this->_branchfree_search<false>(id,di);
1593  return vct_data.get(di);
1594  }
1595 
1601  void resize(size_t n)
1602  {
1603  max_ele = n;
1604  }
1605 
1614  void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv)
1615  {
1616  vct_index.swap(iv);
1617  }
1618 
1624  template <unsigned int p>
1625  auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
1626  {
1627  return vct_data.template get<p>(vct_data.size()-1);
1628  }
1629 
1635  auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
1636  {
1637  return vct_data.get(vct_data.size()-1);
1638  }
1639 
1640  template<unsigned int p>
1641  void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
1642  {
1643  meta_copy_d<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type,
1644  typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
1645  ::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));
1646 
1647  vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);
1648 
1649  meta_copy<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type>
1650  ::meta_copy_(bck_,bck.template get<p>());
1651  }
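 /* The background is the value returned for indexes that were never
  * inserted: it lives in the trailing slot of vct_data, and a failed
  * _branchfree_search resolves to exactly that slot. Usage sketch (vs is a
  * placeholder instance):
  *
  * \code
  * vs.template setBackground<0>(0.0);
  * auto v = vs.template get<0>(123456); // never inserted -> returns 0.0
  * \endcode
  */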
1652 
1660  template <unsigned int p>
1661  auto insert(Ti ele) -> decltype(vct_data.template get<p>(0))
1662  {
1663  vct_add_index.add();
1664  vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1665  vct_add_data.add();
1666  return vct_add_data.template get<p>(vct_add_data.size()-1);
1667  }
1668 
1676  template <unsigned int p>
1677  auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0))
1678  {
1679  is_new = false;
1680  Ti di;
1681 
1682  // first we have to search if the block exist
1683  Ti v = _branchfree_search_nobck<true>(ele,di);
1684 
1685  if (v == ele)
1686  {
1687  // block exist
1688  return vct_data.template get<p>(di);
1689  }
1690  is_new = true;
1691 
1692  // It does not exist, we create it di contain the index where we have to create the new block
1693  vct_index.insert(di);
1694  vct_data.insert(di);
1695  vct_index.template get<0>(di) = ele;
1696  return vct_data.template get<p>(di);
1697  }
1698 
1704  auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.get(0))
1705  {
1706  is_new = false;
1707  Ti di;
1708 
1709  // first we have to search if the block exist
1710  Ti v = _branchfree_search_nobck<true>(ele,di);
1711 
1712  if (v == ele)
1713  {
1714  // block exist
1715  return vct_data.get(di);
1716  }
1717 
1718  // It does not exist, we create it di contain the index where we have to create the new block
1719  vct_index.insert(di);
1720  vct_data.insert(di);
1721  is_new = true;
1722 
1723  vct_index.template get<0>(di) = ele;
1724 
1725  return vct_data.get(di);
1726  }
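 /* Usage sketch of insertFlush: unlike insert(), the element goes straight
  * into the sorted index array, so no flush is required afterwards (vs is a
  * placeholder instance):
  *
  * \code
  * bool is_new;
  * auto & v = vs.template insertFlush<0>(84,is_new);
  * if (is_new == true) {v = 1.0;}  // first occurrence: initialize
  * else                {v += 1.0;} // already present: update in place
  * \endcode
  */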
1727 
1733  auto insert(Ti ele) -> decltype(vct_data.get(0))
1734  {
1735  vct_add_index.add();
1736  vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
1737  vct_add_data.add();
1738  return vct_add_data.get(vct_add_data.size()-1);
1739  }
1740 
1748  template<typename ... v_reduce>
1749  void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
1750  mgpu::ofp_context_t & context,
1751  flush_type opt = FLUSH_ON_HOST,
1752  int i = 0)
1753  {
1754  // Eliminate background
1755  vct_data.resize(vct_index.size());
1756 
1757  if (opt & flush_type::FLUSH_ON_DEVICE)
1758  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
1759  else
1760  {this->flush_on_cpu<v_reduce ... >();}
1761 
1762  resetBck();
1763  }
1764 
1772  template<typename ... v_reduce>
1773  void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
1774  mgpu::ofp_context_t & context,
1775  flush_type opt = FLUSH_ON_HOST)
1776  {
1777  // Eliminate background
1778  vct_data.resize(vct_index.size());
1779 
1780  if (opt & flush_type::FLUSH_ON_DEVICE)
1781  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
1782  else
1783  {this->flush_on_cpu<v_reduce ... >();}
1784 
1785  resetBck();
1786  }
1787 
1793  template<typename ... v_reduce>
1794  void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
1795  {
1796  // Eliminate background
1797  vct_data.resize(vct_index.size());
1798 
1799  if (opt & flush_type::FLUSH_ON_DEVICE)
1800  {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
1801  else
1802  {this->flush_on_cpu<v_reduce ... >();}
1803 
1804  resetBck();
1805  }
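 /* Typical host-side cycle: stage elements with insert(), then flush with
  * one reduction functor per property to merge duplicates (sketch; vs and
  * ctx are placeholders, sadd_ is the additive reduction declared in
  * map_vector_sparse_cuda_kernels.cuh):
  *
  * \code
  * vs.template insert<0>(7) = 2.0;
  * vs.template insert<0>(7) = 3.0;
  * vs.template flush<sadd_<0>>(ctx,FLUSH_ON_HOST); // element 7 now holds 5.0
  * \endcode
  */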
1806 
1812  void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
1813  {
1814  vct_data.resize(vct_data.size()-1);
1815 
1816  if (opt & flush_type::FLUSH_ON_DEVICE)
1817  {this->flush_on_gpu_remove(context);}
1818  else
1819  {
1820  std::cerr << __FILE__ << ":" << __LINE__ << " error, flush_remove on CPU has not been implemented yet" << std::endl;
1821  }
1822 
1823  resetBck();
1824  }
1825 
1830  size_t size()
1831  {
1832  return vct_index.size();
1833  }
1834 
1839  vector<aggregate<Ti>,Memory,layout_base,grow_p> &
1840  private_get_vct_index()
1841  {
1842  return vct_index;
1843  }
1844 
1850  template<unsigned int ... prp>
1851  void deviceToHost()
1852  {
1853  vct_index.template deviceToHost<0>();
1854  vct_data.template deviceToHost<prp...>();
1855  }
1856 
1862  template<unsigned int ... prp>
1863  void hostToDevice()
1864  {
1865  vct_index.template hostToDevice<0>();
1866  vct_data.template hostToDevice<prp...>();
1867  }
1868 
1874  vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
1875  {
1876  vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
1877  vct_add_index.toKernel(),
1878  vct_rem_index.toKernel(),vct_add_data.toKernel(),
1879  vct_nadd_index.toKernel(),
1880  vct_nrem_index.toKernel(),
1881  n_gpu_add_block_slot,
1882  n_gpu_rem_block_slot);
1883 
1884  return mvsck;
1885  }
1886 
1893  void setGPUInsertBuffer(int nblock, int nslot)
1894  {
1895  vct_add_index.resize(nblock*nslot);
1896  vct_nadd_index.resize(nblock);
1897  vct_add_data.resize(nblock*nslot);
1898  n_gpu_add_block_slot = nslot;
1899  vct_nadd_index.template fill<0>(0);
1900  }
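 /* Device-side insertion pattern: reserve nslot slots per block, let a
  * kernel insert through the toKernel() view, then flush on the device
  * (sketch; my_insert_kernel and ite are placeholders):
  *
  * \code
  * vs.setGPUInsertBuffer(nblocks,nslots);
  * CUDA_LAUNCH(my_insert_kernel,ite,vs.toKernel());
  * vs.template flush<sadd_<0>>(ctx,FLUSH_ON_DEVICE);
  * \endcode
  */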
1901 
1907  void preFlush()
1908  {
1909 #ifdef __NVCC__
1910  vct_nadd_index.resize(vct_add_index.size());
1911 
1912  if (vct_nadd_index.size() != 0)
1913  {
1914  auto ite = vct_nadd_index.getGPUIterator();
1915  CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
1916  }
1917  n_gpu_add_block_slot = 1;
1918 #endif
1919  }
1920 
1925  auto getGPUInsertBuffer() -> decltype(vct_add_data)&
1926  {
1927  return vct_add_data;
1928  }
1929 
1936  void setGPURemoveBuffer(int nblock, int nslot)
1937  {
1938  vct_rem_index.resize(nblock*nslot);
1939  vct_nrem_index.resize(nblock);
1940  n_gpu_rem_block_slot = nslot;
1941  vct_nrem_index.template fill<0>(0);
1942  }
1943 
1944 #ifdef CUDA_GPU
1945 
1951  auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
1952  {
1953  return vct_index.getGPUIterator();
1954  }
1955 
1956 #endif
1957 
1962  void clear()
1963  {
1964  vct_data.clear();
1965  vct_index.clear();
1966  vct_add_index.clear();
1967  vct_add_data.clear();
1968 
1969  // re-add background
1970  vct_data.resize(vct_data.size()+1);
1971  vct_data.get(vct_data.size()-1) = bck;
1972 
1973  htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
1974  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
1975 
1976  max_ele = 0;
1977  n_gpu_add_block_slot = 0;
1978  n_gpu_rem_block_slot = 0;
1979  }
1980 
1981  void swap(vector_sparse<T,Ti,Memory,layout,layout_base,grow_p,impl> & sp)
1982  {
1983  vct_data.swap(sp.vct_data);
1984  vct_index.swap(sp.vct_index);
1985  vct_add_index.swap(sp.vct_add_index);
1986  vct_add_data.swap(sp.vct_add_data);
1987 
1988  size_t max_ele_ = sp.max_ele;
1989  sp.max_ele = max_ele;
1990  this->max_ele = max_ele_;
1991  }
1992 
1993  vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
1994  {
1995  return vct_add_data;
1996  }
1997 
1998  vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
1999  {
2000  return vct_add_index;
2001  }
2002 
2003  const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
2004  {
2005  return vct_add_index;
2006  }
2007 
2008  vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
2009  {
2010  return vct_nadd_index;
2011  }
2012 
2013  const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
2014  {
2015  return vct_nadd_index;
2016  }
2017 
2018  auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
2019  {
2020  return blf.get_outputMap();
2021  }
2022 
2023  auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
2024  {
2025  return blf.get_outputMap();
2026  }
2027 
2032  void removeUnusedBuffers()
2033  {
2034  vct_add_data.resize(0);
2035  vct_add_data.shrink_to_fit();
2039 
2040  vct_add_data_reord.resize(0);
2041  vct_add_data_reord.shrink_to_fit();
2042 
2043  vct_add_data_cont.resize(0);
2044  vct_add_data_cont.shrink_to_fit();
2045 
2046  vct_add_data_unique.resize(0);
2047  vct_add_data_unique.shrink_to_fit();
2048  }
2049 
2050  /*! \brief Return the offsets of the segments for the merge indexes
2051  *
2052  *
2053  */
2054  vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
2055  {
2056  return vct_add_index_unique;
2057  }
2058 
2059  const vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
2060  {
2061  return vct_add_index_unique;
2062  }
2063 
2082  vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
2083  {
2084  return vct_add_index_cont_1;
2085  }
2086 
2105  vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
2106  {
2107  return vct_index_tmp2;
2108  }
2109  };
2110 
2111 
2112  template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int>
2113  using vector_sparse_gpu = openfpm::vector_sparse<
2114  T,
2115  indexT,
2116  CudaMemory,
2117  typename memory_traits_inte<T>::type,
2118  memory_traits_inte,
2119  grow_policy_double,
2120  vect_isel<T>::value,
2121  blockSwitch,
2122  block_functor
2123  >;
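 /* Declaring a GPU sparse vector with a scalar and a vector property
  * (sketch):
  *
  * \code
  * openfpm::vector_sparse_gpu<aggregate<float,float[3]>> vs;
  * vs.template setBackground<0>(0.0f);
  * \endcode
  */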
2124 
2125  template<typename T, typename block_functor = stub_block_functor, typename indexT = long int>
2126  using vector_sparse_gpu_block = openfpm::vector_sparse<
2127  T,
2128  indexT,
2129  CudaMemory,
2130  typename memory_traits_inte<T>::type,
2131  memory_traits_inte,
2132  grow_policy_double,
2133  vect_isel<T>::value,
2134  VECTOR_SPARSE_BLOCK,
2135  block_functor
2136  >;
2137 }
2138 
2139 
2140 
2141 #endif /* MAP_VECTOR_SPARSE_HPP_ */
It insert an element in the sparse vector.
vector_index_type_reo & reorder_add_index_cpu
reorder vector index
virtual void * getDevicePointer()
get a readable pointer with the data
Definition: CudaMemory.cu:497
inter_memc< typename T::type >::type type
for each element in the vector interleave memory_c
Definition: memory_conf.hpp:86
sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src &src, encap_dst &dst)
constructor
sparse_vector_special(vector_data_type &vector_data_red, vector_data_type &vector_data, vector_index_type &segment_offset, mgpu::ofp_context_t &context)
constructor
auto getBackground() const -> decltype(vct_data.get(vct_data.size() -1))
Set the background to bck (which value get must return when the value is not find)
vector_index_type & segment_offset
segment of offsets
auto getIndexBuffer() const -> const decltype(vct_index)&
Get the indices buffer.
void flush_v(vector< aggregate< Ti >, Memory, layout_base, grow_p > &vct_add_index_cont_0, mgpu::ofp_context_t &context, flush_type opt=FLUSH_ON_HOST, int i=0)
merge the added element to the main data array but save the insert buffer in v
Ti _branchfree_search_nobck(Ti x, Ti &id) const
get the element i
void operator()(T &t) const
It call the copy function for each property.
vector_index_type & vector_data_map
map of the data
void _branchfree_search(Ti x, Ti &id) const
get the element i
auto insert(Ti ele) -> decltype(vct_data.get(0))
It insert an element in the sparse vector.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
size_t size()
Return how many element you have in this map.
vector_sparse_gpu_ker< T, Ti, layout_base > toKernel()
toKernel function transform this structure into one that can be used on GPU
auto getIndexBuffer() -> decltype(vct_index)&
Get the indices buffer.
void removeUnusedBuffers()
Eliminate many internal temporary buffer you can use this between flushes if you get some out of memo...
static void solveConflicts(vector_index_type &vct_index_old, vector_index_type &vct_index_merge, vector_index_type &vct_index_merge_id, vector_index_type &vct_index_out, vector_index_dtmp_type &vct_index_dtmp, vector_index_type &data_map, vector_index_type2 &segments_new, vector_data_type &vct_data_old, vector_data_type &vct_add_data, vector_data_type &vct_add_data_unique, vector_data_type &vct_data_out, ite_gpu< 1 > &itew, block_functor &blf, mgpu::ofp_context_t &context)
copy for a source object to a destination
void resize(size_t n)
resize to n elements
virtual void deviceToHost()
Move memory from device to host.
Definition: CudaMemory.cu:367
void flush(mgpu::ofp_context_t &context, flush_type opt=FLUSH_ON_HOST)
merge the added element to the main data array
auto getDataBuffer() -> decltype(vct_data)&
Get the data buffer.
void operator()(T &t) const
It call the copy function for each property.
auto get(Ti id) const -> decltype(vct_data.template get< p >(id))
Get an element of the vector.
auto getBackground() const -> decltype(vct_data.template get< p >(vct_data.size() -1))
Set the background to bck (which value get must return when the value is not find)
sg_type & sg
encapsulated source object
auto insertFlush(Ti ele, bool &is_new) -> decltype(vct_data.get(0))
It insert an element in the sparse vector.
this class is a functor for "for_each" algorithm
void clear()
Clear all from all the elements.
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
vector_data_type & vector_data_red
Vector in which to the reduction.
temporal buffer for reductions
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
this class is a functor for "for_each" algorithm
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
vector< aggregate< Ti >, Memory, layout_base, grow_p > & getMergeIndexMapVector()
Return the merge mapping vector.
auto insert(Ti ele) -> decltype(vct_data.template get< p >(0))
It insert an element in the sparse vector.
void merge_indexes(vector< aggregate< Ti >, Memory, layout_base, grow_p > &vct_add_index_sort, vector< aggregate< Ti, Ti >, Memory, layout_base, grow_p > &vct_add_index_unique, vector< aggregate< Ti >, Memory, layout_base, grow_p > &vct_merge_index, vector< aggregate< Ti >, Memory, layout_base, grow_p > &vct_merge_index_map, mgpu::ofp_context_t &context)
Merge indexes.
mgpu::ofp_context_t & context
gpu context
void swapIndexVector(vector< aggregate< Ti >, Memory, layout_base, grow_p > &iv)
__device__ __host__ void operator()(T &t) const
It call the copy function for each property.
openfpm::sparse_index< Ti > get_sparse(Ti id) const
Get the sparse index.