OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
vector_dist_comm.hpp
1 /*
2  * vector_dist_comm.hpp
3  *
4  * Created on: Aug 18, 2016
5  * Author: i-bird
6  */
7 
8 #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
9 #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_
10 
11 #define TEST1
12 
13 #if defined(CUDA_GPU) && defined(__NVCC__)
14 #include "Vector/cuda/vector_dist_cuda_funcs.cuh"
15 #include "util/cuda/kernels.cuh"
16 #endif
17 
18 #include "Vector/util/vector_dist_funcs.hpp"
19 #include "cuda/vector_dist_comm_util_funcs.cuh"
20 #include "util/cuda/scan_ofp.cuh"
21 
22 template<typename T>
23 struct DEBUG
24 {
25  static float ret(T & tmp)
26  {
27  return 0.0;
28  }
29 };
30 
31 template<>
32 struct DEBUG<float &>
33 {
34  static float ret(float & tmp)
35  {
36  return tmp;
37  }
38 };
39 
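40 //! Compute the communication options from the ghost_get/ghost_put options:
41 //! NO_CHANGE_ELEMENTS together with SKIP_LABELLING activates RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE,
42 //! while RUN_ON_DEVICE activates MPI_GPU_DIRECT (NVCC-compiled code only).
43 //! \return the options to pass to the semantic send/receive (SSendRecv*) calls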
44 inline static size_t compute_options(size_t opt)
45 {
46  size_t opt_ = NONE;
47  if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING)
48  {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}
49 
50  if (opt & RUN_ON_DEVICE)
51  {
52 #if defined(CUDA_GPU) && defined(__NVCC__)
53  // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
54  opt_ |= MPI_GPU_DIRECT;
55 #else
56  std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
57 #endif
58  }
59 
60  return opt_;
61 }
62 
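63 //! Template selector for asynchronous or not asynchronous communication.
64 //! The primary template (impl == GHOST_SYNC) sends and receives the ghost
65 //! particle information immediately; the GHOST_ASYNC specialization below
66 //! posts the communications (SSendRecv*Async) and completes them later in
67 //! the sendrecv_prp_wait/sendrecv_pos_wait functions.
68 //! \tparam impl GHOST_SYNC or GHOST_ASYNC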
69 template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
70 struct ghost_exchange_comm_impl
71 {
72  template<typename Vcluster_type, typename vector_prop_type,
73  typename vector_pos_type, typename send_vector,
74  typename prc_recv_get_type, typename prc_g_opart_type,
75  typename recv_sz_get_type, typename recv_sz_get_byte_type,
76  typename g_opart_sz_type>
77  static inline void sendrecv_prp(Vcluster_type & v_cl,
78  openfpm::vector<send_vector> & g_send_prp,
79  vector_prop_type & v_prp,
80  vector_pos_type & v_pos,
81  prc_g_opart_type & prc_g_opart,
82  prc_recv_get_type & prc_recv_get,
83  recv_sz_get_type & recv_sz_get,
84  recv_sz_get_byte_type & recv_sz_get_byte,
85  g_opart_sz_type & g_opart_sz,
86  size_t g_m,
87  size_t opt)
88  {
89  // if there are no properties skip
90  // SSendRecvP sends everything when we do not give properties
91 
92  if (sizeof...(prp) != 0)
93  {
94  size_t opt_ = compute_options(opt);
95  if (opt & SKIP_LABELLING)
96  {
97  if (opt & RUN_ON_DEVICE)
98  {
99  op_ssend_gg_recv_merge_run_device opm(g_m);
100  v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
101  }
102  else
103  {
104  op_ssend_gg_recv_merge opm(g_m);
105  v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
106  }
107  }
108  else
109  {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
110 
111  // fill g_opart_sz
112  g_opart_sz.resize(prc_g_opart.size());
113 
114  for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
115  g_opart_sz.get(i) = g_send_prp.get(i).size();
116  }
117  }
118 
119  template<typename Vcluster_type, typename vector_prop_type,
120  typename vector_pos_type, typename send_pos_vector,
121  typename prc_recv_get_type, typename prc_g_opart_type,
122  typename recv_sz_get_type>
123  static inline void sendrecv_pos(Vcluster_type & v_cl,
124  openfpm::vector<send_pos_vector> & g_pos_send,
125  vector_prop_type & v_prp,
126  vector_pos_type & v_pos,
127  prc_recv_get_type & prc_recv_get,
128  recv_sz_get_type & recv_sz_get,
129  prc_g_opart_type & prc_g_opart,
130  size_t opt)
131  {
132  size_t opt_ = compute_options(opt);
133  if (opt & SKIP_LABELLING)
134  {
135  v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
136  }
137  else
138  {
139  prc_recv_get.clear();
140  recv_sz_get.clear();
141  v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
142  }
143  }
144 
145  template<typename Vcluster_type, typename vector_prop_type,
146  typename vector_pos_type, typename send_pos_vector,
147  typename prc_recv_get_type, typename prc_g_opart_type,
148  typename recv_sz_get_type>
149  static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
150  openfpm::vector<send_pos_vector> & g_pos_send,
151  vector_prop_type & v_prp,
152  vector_pos_type & v_pos,
153  prc_recv_get_type & prc_recv_get,
154  recv_sz_get_type & recv_sz_get,
155  prc_g_opart_type & prc_g_opart,
156  size_t opt)
157  {}
158 
159  template<typename Vcluster_type, typename vector_prop_type,
160  typename vector_pos_type, typename send_vector,
161  typename prc_recv_get_type, typename prc_g_opart_type,
162  typename recv_sz_get_type, typename recv_sz_get_byte_type,
163  typename g_opart_sz_type>
164  static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
165  openfpm::vector<send_vector> & g_send_prp,
166  vector_prop_type & v_prp,
167  vector_pos_type & v_pos,
168  prc_g_opart_type & prc_g_opart,
169  prc_recv_get_type & prc_recv_get,
170  recv_sz_get_type & recv_sz_get,
171  recv_sz_get_byte_type & recv_sz_get_byte,
172  g_opart_sz_type & g_opart_sz,
173  size_t g_m,
174  size_t opt)
175  {}
176 };
177 
178 
179 template<template<typename> class layout_base, unsigned int ... prp>
180 struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base, prp ... >
181 {
182  template<typename Vcluster_type, typename vector_prop_type,
183  typename vector_pos_type, typename send_vector,
184  typename prc_recv_get_type, typename prc_g_opart_type,
185  typename recv_sz_get_type, typename recv_sz_get_byte_type,
186  typename g_opart_sz_type>
187  static inline void sendrecv_prp(Vcluster_type & v_cl,
188  openfpm::vector<send_vector> & g_send_prp,
189  vector_prop_type & v_prp,
190  vector_pos_type & v_pos,
191  prc_g_opart_type & prc_g_opart,
192  prc_recv_get_type & prc_recv_get,
193  recv_sz_get_type & recv_sz_get,
194  recv_sz_get_byte_type & recv_sz_get_byte,
195  g_opart_sz_type & g_opart_sz,
196  size_t g_m,
197  size_t opt)
198  {
199  prc_recv_get.clear();
200  recv_sz_get.clear();
201 
202  // if there are no properties skip
203  // SSendRecvP sends everything when we do not give properties
204 
205  if (sizeof...(prp) != 0)
206  {
207  size_t opt_ = compute_options(opt);
208  if (opt & SKIP_LABELLING)
209  {
210  if (opt & RUN_ON_DEVICE)
211  {
212  op_ssend_gg_recv_merge_run_device opm(g_m);
213  v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
214  }
215  else
216  {
217  op_ssend_gg_recv_merge opm(g_m);
218  v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
219  }
220  }
221  else
222  {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
223  }
224 
225  // fill g_opart_sz
226  g_opart_sz.resize(prc_g_opart.size());
227 
228  for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
229  {g_opart_sz.get(i) = g_send_prp.get(i).size();}
230  }
231 
232  template<typename Vcluster_type, typename vector_prop_type,
233  typename vector_pos_type, typename send_pos_vector,
234  typename prc_recv_get_type, typename prc_g_opart_type,
235  typename recv_sz_get_type>
236  static inline void sendrecv_pos(Vcluster_type & v_cl,
237  openfpm::vector<send_pos_vector> & g_pos_send,
238  vector_prop_type & v_prp,
239  vector_pos_type & v_pos,
240  prc_recv_get_type & prc_recv_get,
241  recv_sz_get_type & recv_sz_get,
242  prc_g_opart_type & prc_g_opart,
243  size_t opt)
244  {
245  prc_recv_get.clear();
246  recv_sz_get.clear();
247 
248  size_t opt_ = compute_options(opt);
249  if (opt & SKIP_LABELLING)
250  {
251  v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
252  }
253  else
254  {
255  prc_recv_get.clear();
256  recv_sz_get.clear();
257  v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
258  }
259  }
260 
261  template<typename Vcluster_type, typename vector_prop_type,
262  typename vector_pos_type, typename send_pos_vector,
263  typename prc_recv_get_type, typename prc_g_opart_type,
264  typename recv_sz_get_type>
265  static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
266  openfpm::vector<send_pos_vector> & g_pos_send,
267  vector_prop_type & v_prp,
268  vector_pos_type & v_pos,
269  prc_recv_get_type & prc_recv_get,
270  recv_sz_get_type & recv_sz_get,
271  prc_g_opart_type & prc_g_opart,
272  size_t opt)
273  {
274  size_t opt_ = compute_options(opt);
275  if (opt & SKIP_LABELLING)
276  {
277  v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
278  }
279  else
280  {
281  v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
282  }
283  }
284 
285  template<typename Vcluster_type, typename vector_prop_type,
286  typename vector_pos_type, typename send_vector,
287  typename prc_recv_get_type, typename prc_g_opart_type,
288  typename recv_sz_get_type, typename recv_sz_get_byte_type,
289  typename g_opart_sz_type>
290  static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
291  openfpm::vector<send_vector> & g_send_prp,
292  vector_prop_type & v_prp,
293  vector_pos_type & v_pos,
294  prc_g_opart_type & prc_g_opart,
295  prc_recv_get_type & prc_recv_get,
296  recv_sz_get_type & recv_sz_get,
297  recv_sz_get_byte_type & recv_sz_get_byte,
298  g_opart_sz_type & g_opart_sz,
299  size_t g_m,
300  size_t opt)
301  {
302  // if there are no properties skip
303  // SSendRecvP sends everything when we do not give properties
304 
305  if (sizeof...(prp) != 0)
306  {
307  size_t opt_ = compute_options(opt);
308  if (opt & SKIP_LABELLING)
309  {
310  if (opt & RUN_ON_DEVICE)
311  {
312  op_ssend_gg_recv_merge_run_device opm(g_m);
313  v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
314  }
315  else
316  {
317  op_ssend_gg_recv_merge opm(g_m);
318  v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
319  }
320  }
321  else
322  {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
323  }
324  }
325 };
326 
327 
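328 /*! \brief This class is a helper for the communication of vector_dist
329  *
330  * \tparam dim Dimensionality of the space where the particles live
331  * \tparam St type of space (float, double, ...)
332  * \tparam prop properties the particles store, in OpenFPM data structure format
333  * \tparam Decomposition decomposition strategy of the space
334  * \tparam Memory memory used for the buffers (HeapMemory, CudaMemory, ...)
335  * \tparam layout_base memory layout of the properties (memory_traits_lin or memory_traits_inte)
336  *
337  * Usage sketch through the public vector_dist interface (as in the OpenFPM examples):
338  * vd.map() redistributes the particles across processors (map_), and
339  * vd.ghost_get<0,1>() synchronizes the ghost layer (ghost_get_). */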
340 template<unsigned int dim,
341  typename St,
342  typename prop,
343  typename Decomposition = CartDecomposition<dim,St>,
344  typename Memory = HeapMemory,
345  template<typename> class layout_base = memory_traits_lin>
346 class vector_dist_comm
347 {
348  //! Number of units for each sub-domain
349  size_t v_sub_unit_factor = 64;
350 
351  //! definition of the send vector for position
352  typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector;
353 
354  //! VCluster
355  Vcluster<Memory> & v_cl;
356 
357  //! Domain decomposition
358  Decomposition dec;
359 
360  //! It map the processor id with the communication request into map procedure
361  openfpm::vector<size_t> p_map_req;
362 
363  //! Contain the processor id of each particle (basically where they have to go)
364  openfpm::vector<aggregate<int, int, int>,
365  Memory,
366  layout_base > m_opart;
367 
368  //! For each near processor, the ids of the particles to send (ghost_get)
369  openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart;
370 
371  //! Same as g_opart but on device, the vector of vector is flatten into a single vector
372  openfpm::vector<aggregate<unsigned int, unsigned long int>,
373  CudaMemory,
374  memory_traits_inte> g_opart_device;
375 
376  //! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size()
377  openfpm::vector<size_t> g_opart_sz;
378 
379  //! processor rank list of g_opart
380  openfpm::vector<size_t> prc_g_opart;
381 
382  //! It store the list of the processors that communicated with us (local processor)
383  //! during the last ghost_get
384  openfpm::vector<size_t> prc_recv_get_pos;
385  openfpm::vector<size_t> prc_recv_get_prp;
386 
387  //! the same as prc_recv_get but for put
388  openfpm::vector<size_t> prc_recv_put;
389 
390  //! the same as prc_recv_get but for map
391  openfpm::vector<size_t> prc_recv_map;
392 
393  //! It store the number of elements received from each processor
394  //! during the last ghost_get
395  openfpm::vector<size_t> recv_sz_get_pos;
396  openfpm::vector<size_t> recv_sz_get_prp;
397 
398  //! Conversion to byte of recv_sz_get
399  openfpm::vector<size_t> recv_sz_get_byte;
400 
401  //! The same as recv_sz_get but for put
402  openfpm::vector<size_t> recv_sz_put;
403 
404  //! The same as recv_sz_get but for map
405  openfpm::vector<size_t> recv_sz_map;
406 
407  //! elements sent for each processors (ghost_get)
408  openfpm::vector<size_t> prc_sz_gg;
409 
410  //! Processor communication size
411  openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz;
412 
413  //! Processor communication offset (used on device)
414  openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset;
415 
416  //! Helper buffer for computation (on GPU) of local particles (position)
417  openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp;
418 
419  //! Helper buffer for computation (on GPU) of local particles (properties)
420  openfpm::vector<prop,Memory,layout_base> v_prp_tmp;
421 
422  //! temporary buffer to processors ids
423  openfpm::vector<aggregate<unsigned int>,
424  Memory,
425  layout_base> proc_id_out;
426 
427  //! temporary buffer for the scan result
428  openfpm::vector<aggregate<unsigned int>,
429  Memory,
430  layout_base> starts;
431 
432  //! Temporary CudaMemory to do stuff
433  CudaMemory mem;
434 
435  //! Sending buffer
436  openfpm::vector_fr<Memory> hsmem;
437 
438  //! Receiving buffers filled by message_alloc_map
439  openfpm::vector<HeapMemory> recv_mem_gm;
440 
441  //! Local ghost marker: from this index on, v_prp contains the replicated local ghost particles
442  size_t lg_m;
443 
446  //! process the particle with properties
447  template<typename prp_object, int ... prp>
448  struct proc_with_prp
449  {
451  template<typename T1, typename T2> inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)
452  {
453  // source object type
454  typedef decltype(v_prp.get(id)) encap_src;
455  // destination object type
456  typedef decltype(m_prp.get(lbl).get(cnt)) encap_dst;
457 
458  // Copy only the selected properties
459  object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
460  }
461  };
462 
463  /*! \brief Get the number of particles received from each processor during the last ghost_get
464  *
465  * \param i processor (seen as an id from 0 to the number of processors involved)
466  *
467  * \return the number of particles received from processor i
468  */
469  size_t get_last_ghost_get_received_parts(size_t i)
470  {
471  // If the last ghost_get did not have properties the information about the number of particles
472  // received is in recv_sz_get_pos
473  if (recv_sz_get_prp.size() != 0)
474  {return recv_sz_get_prp.get(i);}
475  else
476  {return recv_sz_get_pos.get(i);}
477  }
478 
479  /*! \brief Get the number of processors involved during the last ghost_get
480  *
481  * \return the number of processors
482  */
483  size_t get_last_ghost_get_num_proc()
484  {
485  if (prc_recv_get_prp.size() != 0)
486  {return prc_recv_get_prp.size();}
487  else
488  {return prc_recv_get_pos.size();}
489  }
490 
491  /*! \brief Get the processors involved during the last ghost_get
492  *
493  * \return the list of processors
494  */
495  openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector()
496  {
497  if (prc_recv_get_prp.size() != 0)
498  {return prc_recv_get_prp;}
499  else
500  {return prc_recv_get_pos;}
501  }
502 
503  /*! \brief Calculate the sending buffer size for each processor
504  *
505  * \param prc_sz processor communication size
506  * \param prc_sz_r number of particles to send to each processor (squeezed)
507  * \param prc_r list of the processors we send to (squeezed)
508  */
509  void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
510  openfpm::vector<size_t> & prc_sz_r,
511  openfpm::vector<size_t> & prc_r,
512  size_t opt)
513  {
514  if (opt & RUN_ON_DEVICE)
515  {
516 #ifndef TEST1
517  size_t prev_off = 0;
518  for (size_t i = 0; i < prc_sz.size() ; i++)
519  {
520  if (prc_sz.template get<1>(i) != (unsigned int)-1)
521  {
522  prc_r.add(prc_sz.template get<1>(i));
523  prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
524  }
525  prev_off = prc_sz.template get<0>(i);
526  }
527 #else
528 
529  // Calculate the sending buffer size for each processor, put this information in
530  // a contiguous buffer
531 
532  for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
533  {
534  if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
535  {
536  prc_r.add(i);
537  prc_sz_r.add(prc_sz.template get<0>(i));
538  }
539  }
540 
541 #endif
542  }
543  else
544  {
545  // Calculate the sending buffer size for each processor, put this information in
546  // a contiguous buffer
547 
549  for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
550  {
551  if (prc_sz.template get<0>(i) != 0)
552  {
553  p_map_req.get(i) = prc_r.size();
554  prc_r.add(i);
555  prc_sz_r.add(prc_sz.template get<0>(i));
556  }
557  }
558  }
559  }
560 
561  //! From which decomposition the shift boxes are calculated
562  long int shift_box_ndec = -1;
563 
564  //! this map is used to check if a combination is already present
565  std::unordered_map<size_t, size_t> map_cmb;
566 
567  //! The boxes touching the border of the domain are divided in groups (first vector);
568  //! each group contains the internal ghost boxes of one sector (combination)
569  openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f;
570 
571  //! For each box, the (converted) shift id (device version)
572  openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv;
573 
574  //! The boxes touching the border of the domain + shift vector linearized from where they come from
575  openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev;
576 
577  //! Store the sector for each group (previous vector)
578  openfpm::vector_std<comb<dim>> box_cmb;
579 
580  //! Id of the local particle to replicate for ghost_get
581  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc;
582 
583  /*! \brief For every internal ghost box create a structure that orders the internal
584  * local ghost boxes by their shift vector
585  */
586 
587 
588  void createShiftBox()
589  {
590  if (shift_box_ndec == (long int)dec.get_ndec())
591  {return;}
592 
593  struct sh_box
594  {
595  size_t shift_id;
596 
597  unsigned int box_f_sv;
598  Box<dim,St> box_f_dev;
599 
600  bool operator<(const sh_box & tmp) const
601  {
602  return shift_id < tmp.shift_id;
603  }
604 
605  };
606  openfpm::vector<sh_box> reord_shift;
607  box_f.clear();
608  map_cmb.clear();
609  box_cmb.clear();
610 
611  // Add local particles coming from periodic boundary, the only boxes that count are the ones
612  // touching the border
613  for (size_t i = 0; i < dec.getNLocalSub(); i++)
614  {
615  size_t Nl = dec.getLocalNIGhost(i);
616 
617  for (size_t j = 0; j < Nl; j++)
618  {
619  // If the ghost does not come from the intersection with an out of
620  // border sub-domain, the combination is all zero and n_zero returns dim
621  if (dec.getLocalIGhostPos(i, j).n_zero() == dim)
622  continue;
623 
624  // Check if we already have boxes with such a combination
625  auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());
626  if (it == map_cmb.end())
627  {
628  // we do not have it
629  box_f.add();
630  box_f.last().add(dec.getLocalIGhostBox(i, j));
631  box_cmb.add(dec.getLocalIGhostPos(i, j));
632  map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
633  }
634  else
635  {
636  // we have it
637  box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
638  }
639 
640  reord_shift.add();
641  reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
642  reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
643  reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
644  }
645  }
646 
647  // now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles
648  reord_shift.sort();
649 
650  box_f_dev.resize(reord_shift.size());
651  box_f_sv.resize(reord_shift.size());
652 
653  for (size_t i = 0 ; i < reord_shift.size() ; i++)
654  {
655  box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
656  box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
657  }
658 
659 #ifdef CUDA_GPU
660 
661  // move box_f_dev and box_f_sv to device
662  box_f_dev.template hostToDevice<0,1>();
663  box_f_sv.template hostToDevice<0>();
664 
665 #endif
666 
667  shift_box_ndec = dec.get_ndec();
668  }
669 
677  void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
678  openfpm::vector<prop,Memory,layout_base> & v_prp,
679  size_t opt)
680  {
681  // get the shift vectors
682  const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors();
683 
684  if (!(opt & NO_POSITION))
685  {
686  if (opt & RUN_ON_DEVICE)
687  {
688  local_ghost_from_opart_impl<true,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
689  ::run(o_part_loc,shifts,v_pos,v_prp,opt);
690  }
691  else
692  {
693  for (size_t i = 0 ; i < o_part_loc.size() ; i++)
694  {
695  size_t lin_id = o_part_loc.template get<1>(i);
696  size_t key = o_part_loc.template get<0>(i);
697 
698  Point<dim, St> p = v_pos.get(key);
699  // shift
700  p -= shifts.get(lin_id);
701 
702  // add this particle shifting its position
703  v_pos.add(p);
704  v_prp.get(lg_m+i) = v_prp.get(key);
705  }
706  }
707  }
708  else
709  {
710  if (opt & RUN_ON_DEVICE)
711  {
712  local_ghost_from_opart_impl<false,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
713  ::run(o_part_loc,shifts,v_pos,v_prp,opt);
714  }
715  else
716  {
717  for (size_t i = 0 ; i < o_part_loc.size() ; i++)
718  {
719  size_t key = o_part_loc.template get<0>(i);
720 
721  v_prp.get(lg_m+i) = v_prp.get(key);
722  }
723  }
724  }
725  }
726 
734  void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
735  openfpm::vector<prop,Memory,layout_base> & v_prp,
736  size_t g_m,size_t opt)
737  {
738  o_part_loc.clear();
739 
740  // get the shift vectors
741  const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
742 
743  if (opt & RUN_ON_DEVICE)
744  {
745  local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
746  ::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt);
747  }
748  else
749  {
750  // Label the internal (assigned) particles
751  auto it = v_pos.getIteratorTo(g_m);
752 
753  while (it.isNext())
754  {
755  auto key = it.get();
756 
757  // If particles are inside these boxes
758  for (size_t i = 0; i < box_f.size(); i++)
759  {
760  for (size_t j = 0; j < box_f.get(i).size(); j++)
761  {
762  if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
763  {
764  size_t lin_id = dec.convertShift(box_cmb.get(i));
765 
766  o_part_loc.add();
767  o_part_loc.template get<0>(o_part_loc.size()-1) = key;
768  o_part_loc.template get<1>(o_part_loc.size()-1) = lin_id;
769 
770  Point<dim, St> p = v_pos.get(key);
771  // shift
772  p -= shifts.get(lin_id);
773 
774  // add this particle shifting its position
775  v_pos.add(p);
776  v_prp.add();
777  v_prp.last() = v_prp.get(key);
778 
779  // boxes in one group can be overlapping
780  // we do not have to search for the other
781  // boxes otherwise we will have duplicate particles
782  //
783  // A small note: overlap of boxes across groups is fine
784  // (and needed) because each group has different shift
785  // producing non overlapping particles
786  //
787  break;
788  }
789  }
790  }
791 
792  ++it;
793  }
794  }
795  }
796 
845  void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
846  openfpm::vector<prop,Memory,layout_base> & v_prp,
847  size_t & g_m,
848  size_t opt)
849  {
850  // Create the shift boxes
851  createShiftBox();
852 
853  if (!(opt & SKIP_LABELLING))
854  lg_m = v_prp.size();
855 
856  if (box_f.size() == 0)
857  return;
858  else
859  {
860  if (opt & SKIP_LABELLING)
861  {local_ghost_from_opart(v_pos,v_prp,opt);}
862  else
863  {local_ghost_from_dec(v_pos,v_prp,g_m,opt);}
864  }
865  }
866 
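867  /*! \brief Fill the send buffer with the particle positions, after the particles
868  * have been labelled with labelParticlesGhost
869  *
870  * \param v_pos vector of particle positions
871  * \param prc_sz number of particles to send to each processor
872  * \param g_pos_send sending buffers (opt selects the device fill, async retains extra buffers) */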
873  void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
874  openfpm::vector<size_t> & prc_sz,
875  openfpm::vector<send_pos_vector> & g_pos_send,
876  size_t opt,
877  bool async)
878  {
879  // get the shift vectors
880  const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
881 
882  // create a number of send buffers equal to the near processors
883  g_pos_send.resize(prc_sz.size());
884 
885  size_t old_hsmem_size = 0;
886 
887  // if we do async
888  if (async == true)
889  {
890  old_hsmem_size = hsmem.size();
891  resize_retained_buffer(hsmem,g_pos_send.size() + hsmem.size());
892  }
893  else
894  {resize_retained_buffer(hsmem,g_pos_send.size());}
895 
896  for (size_t i = 0; i < g_pos_send.size(); i++)
897  {
898  // Buffer must be retained and survive the destruction of the
899  // vector
900  if (hsmem.get(i+old_hsmem_size).ref() == 0)
901  {hsmem.get(i+old_hsmem_size).incRef();}
902 
903  // Set the memory to retain the send buffer
904  g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));
905 
906  // resize the sending vector (No allocation is produced)
907  g_pos_send.get(i).resize(prc_sz.get(i));
908  }
909 
910  if (opt & RUN_ON_DEVICE)
911  {
912 #if defined(CUDA_GPU) && defined(__NVCC__)
913 
914  size_t offset = 0;
915 
916  // Fill the sending buffers
917  for (size_t i = 0 ; i < g_pos_send.size() ; i++)
918  {
919  auto ite = g_pos_send.get(i).getGPUIterator();
920 
921  CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
922  ite,
923  g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
924  v_pos.toKernel(),shifts.toKernel(),offset);
925 
926  offset += prc_sz.get(i);
927  }
928 
929 #else
930 
931  std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but this file seems compiled with a normal compiler" << std::endl;
932 
933 #endif
934  }
935  else
936  {
937  // Fill the send buffer
938  for (size_t i = 0; i < g_opart.size(); i++)
939  {
940  for (size_t j = 0; j < g_opart.get(i).size(); j++)
941  {
942  Point<dim, St> s = v_pos.get(g_opart.get(i).template get<0>(j));
943  s -= shifts.get(g_opart.get(i).template get<1>(j));
944  g_pos_send.get(i).set(j, s);
945  }
946  }
947  }
948  }
949 
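950  /*! \brief Fill the send buffer for ghost_put
951  *
952  * ghost_put sends the ghost particles back to the processors that own them,
953  * so the send buffers are sized with the receive information of the last
954  * ghost_get (see get_last_ghost_get_received_parts)
955  *
956  * \param v_prp vector of particle properties
957  * \param g_send_prp sending buffers (one per processor involved in the last ghost_get)
958  * \param g_m ghost marker
959  * \param opt options (RUN_ON_DEVICE fills the buffers with a kernel)
960  */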
961  template<typename send_vector, typename prp_object, int ... prp>
962  void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
963  openfpm::vector<send_vector> & g_send_prp,
964  size_t & g_m,
965  size_t opt)
966  {
967  // create a number of send buffers equal to the near processors
968  // from which we received
969 
970  // NOTE: in some cases the information can be in prc_recv_get_pos
971 
972  size_t nproc = get_last_ghost_get_num_proc();
973 
974  g_send_prp.resize(nproc);
975 
976  resize_retained_buffer(hsmem,g_send_prp.size());
977 
978  for (size_t i = 0; i < g_send_prp.size(); i++)
979  {
980  // Buffer must be retained and survive the destruction of the
981  // vector
982  if (hsmem.get(i).ref() == 0)
983  hsmem.get(i).incRef();
984 
985  // Set the memory to retain the send buffer
986  g_send_prp.get(i).setMemory(hsmem.get(i));
987 
988  size_t n_part_recv = get_last_ghost_get_received_parts(i);
989 
990  // resize the sending vector (No allocation is produced)
991  g_send_prp.get(i).resize(n_part_recv);
992  }
993 
994  size_t accum = g_m;
995 
996  if (opt & RUN_ON_DEVICE)
997  {
998 #if defined(CUDA_GPU) && defined(__NVCC__)
999 
1000  if (sizeof...(prp) != 0)
1001  {
1002  // Fill the sending buffers
1003  for (size_t i = 0 ; i < g_send_prp.size() ; i++)
1004  {
1005  size_t n_part_recv = get_last_ghost_get_received_parts(i);
1006 
1007  auto ite = g_send_prp.get(i).getGPUIterator();
1008 
1009  if (ite.nblocks() == 0) {continue;}
1010 
1011  CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1012  ite,
1013  g_send_prp.get(i).toKernel(),
1014  v_prp.toKernel(),accum);
1015 
1016  accum = accum + n_part_recv;
1017  }
1018  }
1019 
1020 #else
1021 
1022  std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but this file seems compiled with a normal compiler" << std::endl;
1023 
1024 #endif
1025  }
1026  else
1027  {
1028  // Fill the send buffer
1029  for (size_t i = 0; i < g_send_prp.size(); i++)
1030  {
1031  size_t j2 = 0;
1032  size_t n_part_recv = get_last_ghost_get_received_parts(i);
1033 
1034  for (size_t j = accum; j < accum + n_part_recv; j++)
1035  {
1036  // source object type
1037  typedef decltype(v_prp.get(j)) encap_src;
1038  // destination object type
1039  typedef decltype(g_send_prp.get(i).get(j2)) encap_dst;
1040 
1041  // Copy only the selected properties
1042  object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
1043 
1044  j2++;
1045  }
1046 
1047  accum = accum + n_part_recv;
1048  }
1049  }
1050  }
1051 
1052  /*! \brief resize the retained buffer by nbf
1053  *
1054  * \param rt_buf retained buffer; \param nbf new number of buffers
1055  */
1056  void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf)
1057  {
1058  // Release all the buffer that are going to be deleted
1059  for (size_t i = nbf ; i < rt_buf.size() ; i++)
1060  {
1061  rt_buf.get(i).decRef();
1062  }
1063 
1064  hsmem.resize(nbf);
1065  }
1066 
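1067  /*! \brief Functor that sets the retained memory on each property buffer of a
1068  * send vector with an interleaved (memory_traits_inte) layout, one Memory
1069  * object per property
1070  */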
1071  template<typename send_vector, typename v_mpl>
1072  struct set_mem_retained_buffers_inte
1073  {
1074  openfpm::vector<send_vector> & g_send_prp;
1075 
1076  size_t i;
1077 
1078  openfpm::vector_fr<Memory> & hsmem;
1079 
1080  size_t j;
1081 
1082  set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i,
1083  openfpm::vector_fr<Memory> & hsmem, size_t j)
1084  :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
1085  {}
1086 
1087  //! It call the setMemory function for each property
1088  template<typename T>
1089  inline void operator()(T& t)
1090  {
1091  g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));
1092 
1093  j++;
1094  }
1095  };
1096 
1097  template<bool inte_or_lin,typename send_vector, typename v_mpl>
1098  struct set_mem_retained_buffers
1099  {
1100  static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
1101  openfpm::vector<size_t> & prc_sz,
1102  size_t i,
1103  openfpm::vector_fr<Memory> & hsmem,
1104  size_t j)
1105  {
1106  // Set the memory to retain the send buffer
1107  g_send_prp.get(i).setMemory(hsmem.get(j));
1108 
1109  // resize the sending vector (No allocation is produced)
1110  g_send_prp.get(i).resize(prc_sz.get(i));
1111 
1112  return j+1;
1113  }
1114  };
1115 
1116  template<typename send_vector, typename v_mpl>
1117  struct set_mem_retained_buffers<true,send_vector,v_mpl>
1118  {
1119  static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
1120  openfpm::vector<size_t> & prc_sz,
1121  size_t i,
1122  openfpm::vector_fr<Memory> & hsmem,
1123  size_t j)
1124  {
1125  set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);
1126 
1127  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);
1128 
1129  // if we do not send properties do not reallocate
1130  if (boost::mpl::size<v_mpl>::type::value != 0)
1131  {
1132  // resize the sending vector (No allocation is produced)
1133  g_send_prp.get(i).resize(prc_sz.get(i));
1134  }
1135 
1136  return smrbi.j;
1137  }
1138  };
1139 
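1140  /*! \brief Fill the send buffer with the particle properties after the particles
1141  * have been labelled with labelParticlesGhost
1142  *
1143  * \tparam send_vector type of the sending vectors
1144  * \tparam prp_object object containing only the selected properties
1145  * \tparam prp selected properties
1146  *
1147  * \param v_prp vector of particle properties
1148  * \param prc_sz number of particles to send to each processor
1149  * \param g_send_prp sending buffers (one per near processor) */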
1150  template<typename send_vector, typename prp_object, int ... prp>
1151  void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
1152  openfpm::vector<size_t> & prc_sz,
1153  openfpm::vector<send_vector> & g_send_prp,
1154  size_t opt)
1155  {
1156  size_t factor = 1;
1157 
1158  typedef typename to_boost_vmpl<prp...>::type v_mpl;
1159 
1160  if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);}
1161 
1162  // create a number of send buffers equal to the near processors
1163  g_send_prp.resize(prc_sz.size());
1164 
1165  resize_retained_buffer(hsmem,g_send_prp.size()*factor);
1166 
1167  for (size_t i = 0; i < hsmem.size(); i++)
1168  {
1169  // Buffer must be retained and survive the destruction of the
1170  // vector
1171  if (hsmem.get(i).ref() == 0)
1172  {hsmem.get(i).incRef();}
1173  }
1174 
1175  size_t j = 0;
1176  for (size_t i = 0; i < g_send_prp.size(); i++)
1177  {
1178  j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,prc_sz,i,hsmem,j);
1179  }
1180 
1181  if (opt & RUN_ON_DEVICE)
1182  {
1183 #if defined(CUDA_GPU) && defined(__NVCC__)
1184 
1185  size_t offset = 0;
1186 
1187  if (sizeof...(prp) != 0)
1188  {
1189  // Fill the sending buffers
1190  for (size_t i = 0 ; i < g_send_prp.size() ; i++)
1191  {
1192  auto ite = g_send_prp.get(i).getGPUIterator();
1193 
1194  CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1195  ite,
1196  g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
1197  v_prp.toKernel(),offset);
1198 
1199  offset += prc_sz.get(i);
1200  }
1201  }
1202 
1203 #else
1204 
1205  std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but this file seems compiled with a normal compiler" << std::endl;
1206 
1207 #endif
1208  }
1209  else
1210  {
1211  // if no properties must be sent skip this step
1212  if (sizeof...(prp) == 0) {return;}
1213 
1214  // Fill the send buffer
1215  for (size_t i = 0; i < g_opart.size(); i++)
1216  {
1217  for (size_t j = 0; j < g_opart.get(i).size(); j++)
1218  {
1219  // source object type
1220  typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;
1221  // destination object type
1222  typedef decltype(g_send_prp.get(i).get(j)) encap_dst;
1223 
1224  // Copy only the selected properties
1225  object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
1226  }
1227  }
1228  }
1229  }
1230 
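1231  /*! \brief Allocate and fill the send buffers for the map function
1232  *
1233  * \param v_pos vector of particle positions
1234  * \param v_prp vector of particle properties
1235  * \param prc_sz_r number of particles to send to each (sending) processor
1236  * \param prc_r list of the sending processors
1237  * \param m_pos sending buffers for the positions
1238  * \param m_prp sending buffers for the properties
1239  * \param prc_sz processor communication size (used on device)
1240  * \param opt options (RUN_ON_DEVICE fills the buffers with kernels)
1241  */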
1242  void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1243  openfpm::vector<prop,Memory,layout_base> & v_prp,
1244  openfpm::vector<size_t> & prc_sz_r,
1245  openfpm::vector<size_t> & prc_r,
1246  openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
1247  openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
1248  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
1249  size_t opt)
1250  {
1251  m_prp.resize(prc_sz_r.size());
1252  m_pos.resize(prc_sz_r.size());
1253  openfpm::vector<size_t> cnt(prc_sz_r.size());
1254 
1255  for (size_t i = 0; i < prc_sz_r.size() ; i++)
1256  {
1257  // set the size and allocate; using mem guarantees that pos and prp are contiguous
1258  m_pos.get(i).resize(prc_sz_r.get(i));
1259  m_prp.get(i).resize(prc_sz_r.get(i));
1260  cnt.get(i) = 0;
1261  }
1262 
1263  if (opt & RUN_ON_DEVICE)
1264  {
1265  if (v_cl.size() == 1)
1266  {return;}
1267 
1268 #if defined(CUDA_GPU) && defined(__NVCC__)
1269 
1270  // The first part of m_opart and prc_sz contains the local particles
1271 
1272  int rank = v_cl.rank();
1273 
1274  v_pos_tmp.resize(prc_sz.template get<0>(rank));
1275  v_prp_tmp.resize(prc_sz.template get<0>(rank));
1276 
1277  auto ite = v_pos_tmp.getGPUIterator();
1278 
1279  starts.template deviceToHost<0>();
1280  size_t offset = starts.template get<0>(rank);
1281 
1282  // skip if there is no work to do
1283  if (ite.wthr.x != 0)
1284  {
1285  // fill v_pos_tmp and v_prp_tmp with local particles
1286  CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
1287  decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1288  ite,
1289  m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
1290  v_pos.toKernel(),v_prp.toKernel(),offset);
1291  }
1292 
1293  // Fill the sending buffers
1294  for (size_t i = 0 ; i < m_pos.size() ; i++)
1295  {
1296  size_t offset = starts.template get<0>(prc_r.template get<0>(i));
1297 
1298  auto ite = m_pos.get(i).getGPUIterator();
1299 
1300  // skip if there is no work to do
1301  if (ite.wthr.x != 0)
1302  {
1303 
1304  CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
1305  decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1306  ite,
1307  m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
1308  v_pos.toKernel(),v_prp.toKernel(),offset);
1309 
1310  }
1311  }
1312 
1313  // swap the old local particles with the actual local particles
1314  v_pos_tmp.swap(v_pos);
1315  v_prp_tmp.swap(v_prp);
1316 
1317 #else
1318 
1319  std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but this file seems compiled with a normal compiler" << std::endl;
1320 
1321 #endif
1322  }
1323  else
1324  {
1325  // end vector point
1326  long int id_end = v_pos.size();
1327 
1328  // end opart point
1329  long int end = m_opart.size()-1;
1330 
1331  // Run through all the particles and fill the sending buffer
1332  for (size_t i = 0; i < m_opart.size(); i++)
1333  {
1334  process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1335  }
1336 
1337  v_pos.resize(v_pos.size() - m_opart.size());
1338  v_prp.resize(v_prp.size() - m_opart.size());
1339  }
1340  }
1341 
1342 
1355  template<typename prp_object,int ... prp>
1356  void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos,
1357  openfpm::vector<prop,Memory,layout_base> & v_prp,
1358  openfpm::vector<size_t> & prc_sz_r,
1359  openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
1360  openfpm::vector<openfpm::vector<prp_object>> & m_prp)
1361  {
1362  m_prp.resize(prc_sz_r.size());
1363  m_pos.resize(prc_sz_r.size());
1364  openfpm::vector<size_t> cnt(prc_sz_r.size());
1365 
1366  for (size_t i = 0; i < prc_sz_r.size(); i++)
1367  {
1368  // set the size and allocate; using mem guarantees that pos and prp are contiguous
1369  m_pos.get(i).resize(prc_sz_r.get(i));
1370  m_prp.get(i).resize(prc_sz_r.get(i));
1371  cnt.get(i) = 0;
1372  }
1373 
1374  // end vector point
1375  long int id_end = v_pos.size();
1376 
1377  // end opart point
1378  long int end = m_opart.size()-1;
1379 
1380  // Run through all the particles and fill the sending buffer
1381  for (size_t i = 0; i < m_opart.size(); i++)
1382  {
1383  process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1384  }
1385 
1386  v_pos.resize(v_pos.size() - m_opart.size());
1387  v_prp.resize(v_prp.size() - m_opart.size());
1388  }
1389 
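1390  /*! \brief Label the particles with the processor they have to go to (map)
1391  *
1392  * It applies the boundary conditions to each particle and fills lbl_p
1393  * with the particles that must leave the local processor
1394  *
1395  * \param v_pos vector of particle positions
1396  * \param lbl_p labelled particles (id, destination processor)
1397  * \param prc_sz per processor number of particles to send; \param opt options */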
1398  template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1399  openfpm::vector<aggregate<int,int,int>,
1400  Memory,
1401  layout_base> & lbl_p,
1402  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
1403  size_t opt)
1404  {
1405  if (opt == RUN_ON_DEVICE)
1406  {
1407 #ifdef __NVCC__
1408 
1409  // Map directly on gpu
1410 
1411  lbl_p.resize(v_pos.size());
1412 
1413  // labelling kernel
1414 
1415  prc_sz.template fill<0>(0);
1416 
1417  auto ite = v_pos.getGPUIterator();
1418  if (ite.wthr.x == 0)
1419  {
1420  starts.resize(v_cl.size());
1421  starts.template fill<0>(0);
1422  return;
1423  }
1424 
1425  // with only one process we can skip the labelling ...
1426  if (v_cl.size() == 1)
1427  {
1428  // ... but we have to apply the boundary conditions
1429 
1429 
1430  periodicity_int<dim> bc;
1431 
1432  for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);}
1433 
1434  CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());
1435 
1436  return;
1437  }
1438 
1439  // label particle processor
1440  CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
1441  ite,
1442  dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
1443 
1444  starts.resize(v_cl.size());
1445  openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
1446 
1447  // move prc_sz to host
1448  prc_sz.template deviceToHost<0>();
1449 
1450  ite = lbl_p.getGPUIterator();
1451 
1452  // we order lbl_p
1453  CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());
1454 
1455 
1456 #else
1457 
1458  std::cout << __FILE__ << ":" << __LINE__ << " error: it seems you tried to call map with the RUN_ON_DEVICE option; this requires compiling the program with NVCC" << std::endl;
1459 
1460 #endif
1461  }
1462  else
1463  {
1464  // reset lbl_p
1465  lbl_p.clear();
1466  prc_sz_gg.clear();
1467  o_part_loc.clear();
1468  g_opart.clear();
1469  prc_g_opart.clear();
1470 
1471  // resize the label buffer
1472  prc_sz.template fill<0>(0);
1473 
1474  auto it = v_pos.getIterator();
1475 
1476  // Label all the particles with the processor id where they should go
1477  while (it.isNext())
1478  {
1479  auto key = it.get();
1480 
1481  // Apply the boundary conditions
1482  dec.applyPointBC(v_pos.get(key));
1483 
1484  size_t p_id = 0;
1485 
1486  // Check if the particle is inside the domain
1487  if (dec.getDomain().isInside(v_pos.get(key)) == true)
1488  {p_id = dec.processorID(v_pos.get(key));}
1489  else
1490  {p_id = obp::out(key, v_cl.getProcessUnitID());}
1491 
1492  // Particle to move
1493  if (p_id != v_cl.getProcessUnitID())
1494  {
1495  if ((long int) p_id != -1)
1496  {
1497  prc_sz.template get<0>(p_id)++;
1498  lbl_p.add();
1499  lbl_p.last().template get<0>() = key;
1500  lbl_p.last().template get<2>() = p_id;
1501  }
1502  else
1503  {
1504  lbl_p.add();
1505  lbl_p.last().template get<0>() = key;
1506  lbl_p.last().template get<2>() = p_id;
1507  }
1508  }
1509 
1510  // Add processors and add size
1511 
1512  ++it;
1513  }
1514  }
1515  }
1516 
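1517  /*! \brief Label the particles that must be sent as ghosts
1518  *
1519  * For each particle it asks the decomposition which processors require it
1520  * and with which shift, filling g_opart (or g_opart_device on GPU)
1521  *
1522  * \param v_pos vector of particle positions
1523  * \param v_prp vector of particle properties
1524  * \param prc list of the processors to send to
1525  * \param prc_sz number of particles to send to each processor
1526  * \param prc_offset processor offsets (used on device)
1527  * \param g_m ghost marker
1528  * \param opt options
1529  */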
1530  void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1531  openfpm::vector<prop,Memory,layout_base> & v_prp,
1532  openfpm::vector<size_t> & prc,
1533  openfpm::vector<size_t> & prc_sz,
1534  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
1535  size_t & g_m,
1536  size_t opt)
1537  {
1538  // Buffer that contains, for each processor, the ids of the particles to send
1539  prc_sz.clear();
1540  g_opart.clear();
1541  g_opart.resize(dec.getNNProcessors());
1542  prc_g_opart.clear();
1543 
1544  if (opt & RUN_ON_DEVICE)
1545  {
1546  labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,
1547  Decomposition,std::is_same<Memory,CudaMemory>::value>
1548  ::run(mem,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
1549  }
1550  else
1551  {
1552  // Iterate over all particles
1553  auto it = v_pos.getIteratorTo(g_m);
1554  while (it.isNext())
1555  {
1556  auto key = it.get();
1557 
1558  // Given a particle, it returns which processors require it (first id) and the shift id (second id)
1559  // For an explanation about shift vectors please consult getShiftVector in ie_ghost
1560  const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);
1561 
1562  for (size_t i = 0; i < vp_id.size(); i++)
1563  {
1564  // processor id
1565  size_t p_id = vp_id.get(i).first;
1566 
1567  // add particle to communicate
1568  g_opart.get(p_id).add();
1569  g_opart.get(p_id).last().template get<0>() = key;
1570  g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
1571  }
1572 
1573  ++it;
1574  }
1575 
1576  // remove all the zero entries and construct prc (the list of the sending processors)
1577  openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;
1578 
1579  // count the non zero elements
1580  for (size_t i = 0 ; i < g_opart.size() ; i++)
1581  {
1582  if (g_opart.get(i).size() != 0)
1583  {
1584  prc_sz.add(g_opart.get(i).size());
1585  g_opart_f.add();
1586  g_opart.get(i).swap(g_opart_f.last());
1587  prc.add(dec.IDtoProc(i));
1588  }
1589  }
1590 
1591  g_opart.swap(g_opart_f);
1592  }
1593 #ifdef EXTREA_TRACE_PRE_COMM
1594  Extrae_user_function (0);
1595 #endif
1596  }
1597 
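1598  /*! \brief Call-back to allocate the buffers that receive the incoming particles (map)
1599  *
1600  * \param msg_i size required to receive the message from processor i
1601  * \param total_msg total size to receive from all the processors
1602  * \param total_p the total number of processors that want to communicate with us
1603  * \param i processor id from which we receive
1604  * \param ri request id
1605  * \param ptr a pointer to the vector_dist_comm structure
1606  *
1607  * \return the pointer where to store the message for processor i
1608  */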
1611  static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr)
1612  {
1613  // cast the pointer
1614  vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> * vd = static_cast<vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> *>(ptr);
1615 
1616  vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits());
1617  vd->recv_mem_gm.get(i).resize(msg_i);
1618 
1619  return vd->recv_mem_gm.get(i).getPointer();
1620  }
1621 
1622 public:
1623 
1624  /*! \brief Copy Constructor
1625  *
1626  * \param v vector to copy
1627  */
1628 
1629  vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
1630  :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
1631  {
1632  this->operator=(v);
1633  }
1634 
1635 
1636  /*! \brief Constructor
1637  *
1638  * \param dec decomposition
1639  */
1640 
1641  vector_dist_comm(const Decomposition & dec)
1642  :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
1643  {
1644 
1645  }
1646 
1647  /*! \brief Constructor
1648  *
1649  * \param dec decomposition (rvalue)
1650  */
1651 
1652  vector_dist_comm(Decomposition && dec)
1653  :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
1654  {
1655 
1656  }
1657 
1658  //! Constructor
1659 
1660 
1661  vector_dist_comm()
1662  :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
1663  {
1664  }
1665 
1666  /*! \brief Destructor
1667  *
1668  * Release the retained buffers
1669  */
1670 
1671  ~vector_dist_comm()
1672  {
1673  for (size_t i = 0 ; i < hsmem.size() ; i++)
1674  {
1675  if (hsmem.get(i).ref() == 1)
1676  hsmem.get(i).decRef();
1677  else
1678  std::cout << __FILE__ << ":" << __LINE__ << " internal error memory is in an invalid state " << std::endl;
1679  }
1680 
1681  }
1682 
1683  /*! \brief Get the number of minimum sub-domain per processor
1684  *
1685  * \return the minimum number
1686  */
1687 
1688  size_t getDecompositionGranularity()
1689  {
1690  return v_sub_unit_factor;
1691  }
1692 
1698  void setDecompositionGranularity(size_t n_sub)
1699  {
1700  this->v_sub_unit_factor = n_sub;
1701  }
1702 
1703  /*! \brief Initialize the decomposition
1704  *
1705  * \param box domain
1706  * \param bc boundary conditions
1707  * \param g ghost extension
1708  * \param opt additional options
1709  * \param gdist grid information (used when the decomposition is bound to a grid)
1710  */
1711  void init_decomposition(Box<dim,St> & box,
1712  const size_t (& bc)[dim],
1713  const Ghost<dim,St> & g,
1714  size_t opt,
1715  const grid_sm<dim,void> & gdist)
1716  {
1717  size_t div[dim];
1718 
1719  if (opt & BIND_DEC_TO_GHOST)
1720  {
1721  // padding
1722  size_t pad = 0;
1723 
1724  // CellDecomposer
1725  CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
1726 
1727  // Calculate the divisions for the symmetric Cell-lists
1728  cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);
1729 
1730  for (size_t i = 0 ; i < dim ; i++)
1731  {div[i] = cd_sm.getDiv()[i] - 2*pad;}
1732 
1733  // Create the sub-domains
1734  dec.setParameters(div, box, bc, g, gdist);
1735  }
1736  else
1737  {
1738  dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist);
1739  }
1740  dec.decompose();
1741  }
1742 
1743  /*! \brief Initialize the decomposition, taking the number of sub-sub-domains from a grid
1744  *
1745  * \param box domain
1746  * \param bc boundary conditions
1747  * \param g ghost extension
1748  * \param opt additional options
1749  * \param gdist grid information
1750  */
1751  void init_decomposition_gr_cell(Box<dim,St> & box,
1752  const size_t (& bc)[dim],
1753  const Ghost<dim,St> & g,
1754  size_t opt,
1755  const grid_sm<dim,void> & gdist)
1756  {
1757  size_t div[dim];
1758 
1759  for (size_t i = 0 ; i < dim ; i++)
1760  {div[i] = gdist.size(i);}
1761 
1762  // Create the sub-domains
1763  dec.setParameters(div, box, bc, g);
1764 
1765  dec.decompose();
1766  }
1767 
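1768  /*! \brief It synchronize the properties and position of the ghost particles
1769  *
1770  * \tparam impl GHOST_SYNC or GHOST_ASYNC
1771  * \tparam prp list of properties to get synchronized
1772  *
1773  * \param v_pos vector of particle positions
1774  * \param v_prp vector of particle properties
1775  * \param g_m ghost marker (the index from where the ghost particles start)
1776  * \param opt options (e.g. SKIP_LABELLING, NO_POSITION, RUN_ON_DEVICE)
1777  */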
1778  template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1779  openfpm::vector<prop,Memory,layout_base> & v_prp,
1780  size_t & g_m,
1781  size_t opt = WITH_POSITION)
1782  {
1783 #ifdef PROFILE_SCOREP
1784  SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
1785 #endif
1786 
1787  // Sending property object
1788  typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1789 
1790  // send vector for each processor
1791  typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1792 
1793  if (!(opt & NO_POSITION))
1794  {v_pos.resize(g_m);}
1795 
1796  // reset the ghost part
1797 
1798  if (!(opt & SKIP_LABELLING))
1799  {v_prp.resize(g_m);}
1800 
1801  // Label all the particles
1802  if ((opt & SKIP_LABELLING) == false)
1803  {labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz_gg,prc_offset,g_m,opt);}
1804 
1805  {
1806  // Send and receive ghost particle information
1807  openfpm::vector<send_vector> g_send_prp;
1808 
1809  fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz_gg,g_send_prp,opt);
1810 
1811  #if defined(CUDA_GPU) && defined(__NVCC__)
1812  cudaDeviceSynchronize();
1813  #endif
1814 
1815  // if there are no properties skip
1816  // SSendRecvP sends everything when we do not give properties
1817 
1818  ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1819  sendrecv_prp(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1820  prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1821  }
1822 
1823  if (!(opt & NO_POSITION))
1824  {
1825  // Sending buffer for the ghost particles position
1826  openfpm::vector<send_pos_vector> g_pos_send;
1827 
1828  fill_send_ghost_pos_buf(v_pos,prc_sz_gg,g_pos_send,opt,impl == GHOST_ASYNC);
1829 
1830 #if defined(CUDA_GPU) && defined(__NVCC__)
1831  cudaDeviceSynchronize();
1832 #endif
1833 
1834  ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1835  sendrecv_pos(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1836 
1837  // fill g_opart_sz
1838  g_opart_sz.resize(prc_g_opart.size());
1839 
1840  for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
1841  g_opart_sz.get(i) = g_pos_send.get(i).size();
1842  }
1843 
1844  // Important: ensure that the number of particles in v_prp is equal to v_pos.
1845  // Note that if we do not give properties, sizeof...(prp) == 0 and in general at this point
1846  // v_prp.size() != v_pos.size()
1847  if (!(opt & SKIP_LABELLING))
1848  {
1849  v_prp.resize(v_pos.size());
1850  }
1851 
1852  add_loc_particles_bc(v_pos,v_prp,g_m,opt);
1853  }
1854 
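1855  /*! \brief Complete an asynchronous ghost_get (GHOST_ASYNC)
1856  *
1857  * It waits for the communications posted by ghost_get_<GHOST_ASYNC,...>() to finish
1858  *
1859  * \tparam prp list of properties to get synchronized
1860  *
1861  * \param v_pos vector of particle positions
1862  * \param v_prp vector of particle properties
1863  * \param g_m ghost marker
1864  * \param opt options */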
1865  template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1866  openfpm::vector<prop,Memory,layout_base> & v_prp,
1867  size_t & g_m,
1868  size_t opt = WITH_POSITION)
1869  {
1870  // Sending property object
1871  typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1872 
1873  // send vector for each processor
1874  typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1875 
1876  // Send and receive ghost particle information
1877  openfpm::vector<send_vector> g_send_prp;
1878  openfpm::vector<send_pos_vector> g_pos_send;
1879 
1880  ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1881  sendrecv_prp_wait(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1882  prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1883 
1884 
1885  ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1886  sendrecv_pos_wait(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1887  }
1888 
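1889  /*! \brief It moves all the particles that do not belong to the local processor
1890  * to the respective processor, communicating only the properties in the list
1891  *
1892  * In general this function is called after moving the particles, to send the
1893  * elements that left the local processor, or just after initialization if each
1894  * processor contains non local particles
1895  *
1896  * \tparam prp properties to communicate
1897  *
1898  * \param v_pos vector of particle positions
1899  * \param v_prp vector of particle properties
1900  * \param g_m ghost marker
1901  * \param opt options
1902  */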
1905  template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt)
1906  {
1907  if (opt & RUN_ON_DEVICE)
1908  {
1909  std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
1910  return;
1911  }
1912 
1913  typedef KillParticle obp;
1914 
1915  // Processor communication size
1916  openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits());
1917 
1918  // map completely reset the ghost part
1919  v_pos.resize(g_m);
1920  v_prp.resize(g_m);
1921 
1922  // m_opart, Contain the processor id of each particle (basically where they have to go)
1923  labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
1924 
1925  // Calculate the sending buffer size for each processor, put this information in
1926  // a contiguous buffer
1927  p_map_req.resize(v_cl.getProcessingUnits());
1928  openfpm::vector<size_t> prc_sz_r;
1929  openfpm::vector<size_t> prc_r;
1930 
1931  for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
1932  {
1933  if (prc_sz.template get<0>(i) != 0)
1934  {
1935  p_map_req.get(i) = prc_r.size();
1936  prc_r.add(i);
1937  prc_sz_r.add(prc_sz.template get<0>(i));
1938  }
1939  }
1940 
1941  if (opt & MAP_LOCAL)
1942  {
1943  // if the map is local we indicate that we receive only from the neighboring processors
1944 
1945  prc_recv_map.clear();
1946  for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
1947  {prc_recv_map.add(dec.IDtoProc(i));}
1948  }
1949 
1950  // Sending property object
1951  typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1952 
1953  //! position vector
1954  openfpm::vector<openfpm::vector<Point<dim,St>>> m_pos;
1955  //! properties vector
1956  openfpm::vector<openfpm::vector<prp_object>> m_prp;
1957 
1958  fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
1959 
1960  v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
1961  v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
1962 
1963  // mark the ghost part
1964 
1965  g_m = v_pos.size();
1966  }
1967 
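1968  /*! \brief It moves all the particles that do not belong to the local processor
1969  * to the respective processor
1970  *
1971  * It also deletes the particles labelled as out-of-domain by the out-of-bound
1972  * policy obp (default: KillParticle)
1973  *
1974  * \tparam obp out-of-bound policy
1975  *
1976  * \param v_pos vector of particle positions
1977  * \param v_prp vector of particle properties
1978  * \param g_m ghost marker
1979  * \param opt options (e.g. RUN_ON_DEVICE)
1980  */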
1981  template<typename obp = KillParticle>
1982  void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1983  openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m,
1984  size_t opt)
1985  {
1986 #ifdef PROFILE_SCOREP
1987  SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
1988 #endif
1989 
1990  prc_sz.resize(v_cl.getProcessingUnits());
1991 
1992  // map completely reset the ghost part
1993  v_pos.resize(g_m);
1994  v_prp.resize(g_m);
1995 
1996  // Contain the processor id of each particle (basically where they have to go)
1997  labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
1998 
1999  openfpm::vector<size_t> prc_sz_r;
2000  openfpm::vector<size_t> prc_r;
2001 
2002  // Calculate the sending buffer size for each processor, put this information in
2003  // a contiguous buffer
2004  calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt);
2005 
2005 
2006  //! position vector
2007  openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos;
2008  //! properties vector
2009  openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp;
2010 
2011  fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt);
2012 
2013  size_t opt_ = 0;
2014  if (opt & RUN_ON_DEVICE)
2015  {
2016 #if defined(CUDA_GPU) && defined(__NVCC__)
2017  // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2018  cudaDeviceSynchronize();
2019  opt_ |= MPI_GPU_DIRECT;
2020 #else
2021  std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2022 #endif
2023  }
2024 
2025  v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>,
2026  openfpm::vector<Point<dim, St>,Memory,layout_base>,
2027  layout_base>
2028  (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);
2029 
2030  v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
2031  openfpm::vector<prop,Memory,layout_base>,
2032  layout_base>
2033  (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);
2034 
2035  // mark the ghost part
2036 
2037  g_m = v_pos.size();
2038  }
2039 
2040  /*! \brief Get the decomposition
2041  *
2042  * \return the decomposition
2043  */
2044 
2045  inline Decomposition & getDecomposition()
2046  {
2047  return dec;
2048  }
2049 
2055  inline const Decomposition & getDecomposition() const
2056  {
2057  return dec;
2058  }
2059 
2060  /*! \brief Copy a vector
2061  *
2062  * \param vc vector to copy
2063  *
2064  * \return itself
2065  */
2066  vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
2067  operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
2068  {
2069  dec = vc.dec;
2070 
2071  return *this;
2072  }
2073 
2074  /*! \brief Copy a vector
2075  *
2076  * \param vc vector to copy
2077  *
2078  * \return itself
2079  */
2080  vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
2081  operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
2082  {
2083  dec = vc.dec;
2084 
2085  return *this;
2086  }
2087 
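2088  /*! \brief Ghost put
2089  *
2090  * It merges the properties of the ghost particles back into the real
2091  * particles they replicate, using the reduction operation op
2092  *
2093  * Usage sketch via the public vector_dist interface (as in the OpenFPM
2094  * examples): vd.ghost_get<0>(); ... accumulate on ghosts ...; vd.ghost_put<add_,0>();
2095  *
2096  * \tparam op merge operation (e.g. add_)
2097  * \tparam prp properties to merge
2098  *
2099  * \param v_pos particle positions; \param v_prp particle properties; \param g_m ghost marker; \param opt options */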
2099  template<template<typename,typename> class op, int ... prp>
2100  void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
2101  openfpm::vector<prop,Memory,layout_base> & v_prp,
2102  size_t & g_m,
2103  size_t opt)
2104  {
2105  // Sending property object
2106  typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
2107 
2108  // send vector for each processor
2109  typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
2110 
2111  openfpm::vector<send_vector> g_send_prp;
2112  fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt);
2113 
2114  if (opt & RUN_ON_DEVICE)
2115  {
2116 #if defined(CUDA_GPU) && defined(__NVCC__)
2117  // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2118  cudaDeviceSynchronize();
2119 #else
2120  std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2121 #endif
2122  }
2123 
2124  // Send and receive ghost particle information
2125  if (opt & NO_CHANGE_ELEMENTS)
2126  {
2127  size_t opt_ = compute_options(opt);
2128 
2129  if (opt & RUN_ON_DEVICE)
2130  {
2131  op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2132  v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2133  send_vector,
2134  decltype(v_prp),
2135  layout_base,
2136  prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2137  }
2138  else
2139  {
2140  op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2141  v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2142  send_vector,
2143  decltype(v_prp),
2144  layout_base,
2145  prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2146  }
2147  }
2148  else
2149  {
2150  size_t opt_ = compute_options(opt);
2151 
2152  if (opt & RUN_ON_DEVICE)
2153  {
2154  op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2155  v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2156  send_vector,
2157  decltype(v_prp),
2158  layout_base,
2159  prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2160  }
2161  else
2162  {
2163  op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2164  v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2165  send_vector,
2166  decltype(v_prp),
2167  layout_base,
2168  prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2169  }
2170  }
2171 
2172  // process also the local replicated particles
2173 
2174  if (lg_m < v_prp.size() && v_prp.size() - lg_m != o_part_loc.size())
2175  {
2176  std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
2177  std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;
2178  }
2179 
2180 
2181  if (opt & RUN_ON_DEVICE)
2182  {
2183  v_prp.template merge_prp_v_device<op,prop,Memory,
2184  openfpm::grow_policy_double,
2185  layout_base,
2186  decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2187  }
2188  else
2189  {
2190  v_prp.template merge_prp_v<op,prop,Memory,
2191  openfpm::grow_policy_double,
2192  layout_base,
2193  decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2194  }
2195  }
2196 };
2197 
2198 
2199 #endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */