OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
vector_dist_comm_util_funcs.cuh
1 /*
2  * vector_dist_comm_util_funcs.hpp
3  *
4  * Created on: Sep 13, 2018
5  * Author: i-bird
6  */
7 
8 #ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
9 #define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
10 
11 #include "util/common_pdata.hpp"
12 
13 constexpr int NO_POSITION = 1;
14 constexpr int WITH_POSITION = 2;
15 constexpr int NO_CHANGE_ELEMENTS = 4;
16 
17 constexpr int BIND_DEC_TO_GHOST = 1;
18 
19 constexpr int MAP_LOCAL = 2;
20 
21 constexpr int GHOST_SYNC = 0;
22 constexpr int GHOST_ASYNC = 1;
23 
24 template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
26 {
27  static void run(CudaMemory & mem,
28  Decomposition & dec,
30  CudaMemory,
31  memory_traits_inte> & g_opart_device,
33  Memory,
34  layout_base> & proc_id_out,
36  Memory,
37  layout_base> & starts,
38  Vcluster<Memory> & v_cl,
39  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
42  openfpm::vector<size_t> & prc_sz,
43  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
44  size_t & g_m,
45  size_t opt)
46  {
47  std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
48  }
49 };
50 
51 
52 
53 template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
54 struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
55 {
56  static void run(CudaMemory & mem,
57  Decomposition & dec,
59  CudaMemory,
60  memory_traits_inte> & g_opart_device,
62  Memory,
63  layout_base> & proc_id_out,
65  Memory,
66  layout_base> & starts,
67  Vcluster<Memory> & v_cl,
68  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
71  openfpm::vector<size_t> & prc_sz,
72  openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
73  size_t & g_m,
74  size_t opt)
75  {
76 #if defined(CUDA_GPU) && defined(__NVCC__)
77 
78  if (v_cl.size() == 1)
79  {return;}
80 
81  proc_id_out.resize(v_pos.size()+1);
82  proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
83  proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
84 
85  auto ite = v_pos.getGPUIterator();
86 
87  // no work to do return
88  if (ite.wthr.x == 0)
89  {return;}
90 
91  // First we have to see how many entry each particle produce
92  CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
93  ite,
94  dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
95 
96  // scan
97  //sc.scan_(proc_id_out,starts);
98  starts.resize(proc_id_out.size());
99  openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
100  starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
101  size_t sz = starts.template get<0>(starts.size()-1);
102 
103  // we compute processor id for each particle
104 
105  g_opart_device.resize(sz);
106 
107  ite = v_pos.getGPUIterator();
108 
109  // we compute processor id for each particle
110  CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
111  ite,
112  dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
113 
114  // sort particles
115  openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
116 
117  mem.allocate(sizeof(int));
118  mem.fill(0);
119  prc_offset.resize(v_cl.size());
120 
121  ite = g_opart_device.getGPUIterator();
122 
123  if (ite.wthr.x != 0)
124  {
125  // Find the buffer bases
126  CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
127  ite,
128  g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
129  }
130 
131  // Trasfer the number of offsets on CPU
132  mem.deviceToHost();
133  int noff = *(int *)mem.getPointer();
134 
135  // create the terminal of prc_offset
136  prc_offset.resize(noff+1,DATA_ON_DEVICE);
137 
138  // Move the last processor index on device (id)
139  if (g_opart_device.size() != 0)
140  {g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
141  prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
142  if (g_opart_device.size() != 0)
143  {prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
144  else
145  {prc_offset.template get<1>(prc_offset.size()-1) = 0;}
146 
147  prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
148 
149  // Here we reorder the offsets in ascending order
150  openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
151 
152  prc_offset.template deviceToHost<0,1>();
153 
154  // In this case we do not have communications at all
155  if (g_opart_device.size() == 0)
156  {noff = -1;}
157 
158  prc.resize(noff+1);
159  prc_sz.resize(noff+1);
160 
161  size_t base_offset = 0;
162 
163  // Transfert to prc the list of processors
164  prc.resize(noff+1);
165  for (size_t i = 0 ; i < noff+1 ; i++)
166  {
167  prc.get(i) = prc_offset.template get<1>(i);
168  prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
169  base_offset = prc_offset.template get<0>(i);
170  }
171 #else
172 
173  std::cout << __FILE__ << ":" << __LINE__ << " error: to use gpu computation you must compile vector_dist.hpp with NVCC" << std::endl;
174 
175 #endif
176  }
177 };
178 
179 template<bool with_pos,unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
181 {
182  static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
183  const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
184  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
186  size_t opt)
187  {
188  std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
189  }
190 };
191 
192 template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
193 struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
194 {
195  static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
196  const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
197  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
199  size_t opt)
200  {
201 #if defined(CUDA_GPU) && defined(__NVCC__)
202 
203  auto ite = o_part_loc.getGPUIterator();
204 
205  size_t old = v_pos.size();
206 
207  if (!(opt & NO_POSITION))
208  {v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}
209 
210  if (!(opt & SKIP_LABELLING))
211  {
212  v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
213  }
214 
215 
216  if (ite.wthr.x != 0)
217  {
218  CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
219  ite,
220  o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
221  }
222 #else
223  std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
224 #endif
225  }
226 };
227 
228 template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
230 {
231  static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
232  const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
233  openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
234  openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
235  Vcluster<Memory> & v_cl,
236  openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
237  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
239  size_t & g_m,
240  size_t opt)
241  {
242  std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
243  }
244 };
245 
246 
247 template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
248 struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
249 {
250  static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
251  const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
252  openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
253  openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
254  Vcluster<Memory> & v_cl,
255  openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
256  openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
258  size_t & g_m,
259  size_t opt)
260  {
261 #if defined(CUDA_GPU) && defined(__NVCC__)
262 
263  o_part_loc.resize(g_m+1);
264  o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
265  o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);
266 
267  // Label the internal (assigned) particles
268  auto ite = v_pos.getGPUIteratorTo(g_m);
269 
270  // label particle processor
271  CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
272  ite,
273  box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
274 
275  starts.resize(o_part_loc.size());
276  openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
277 
278  starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
279  size_t total = starts.template get<0>(starts.size()-1);
280  size_t old = v_pos.size();
281 
282  v_pos.resize(v_pos.size() + total);
283  v_prp.resize(v_prp.size() + total);
284 
285  // Label the internal (assigned) particles
286  ite = v_pos.getGPUIteratorTo(g_m);
287 
288  // resize o_part_loc
289  o_part_loc.resize(total);
290 
291  CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
292  decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
293  decltype(starts.toKernel()),decltype(shifts.toKernel()),
294  decltype(o_part_loc.toKernel())>),
295  ite,
296  box_f_dev.toKernel(),box_f_sv.toKernel(),
297  v_pos.toKernel(),v_prp.toKernel(),
298  starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);
299 
300 #else
301  std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
302 #endif
303  }
304 };
305 
306 #endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */
virtual bool allocate(size_t sz)
allocate memory
Definition: CudaMemory.cu:38
virtual void * getPointer()
get a readable pointer with the data
Definition: CudaMemory.cu:352
This class implements the point shape in an N-dimensional space.
Definition: Point.hpp:27
size_t size()
Stub size.
Definition: map_vector.hpp:211
Transform the boost::fusion::vector into memory specification (memory_traits)
Definition: memory_conf.hpp:83
virtual void fill(unsigned char c)
fill the buffer with a byte
Definition: CudaMemory.cu:479
This class define the domain decomposition interface.
virtual void * getDevicePointer()
get a readable pointer with the data
Definition: CudaMemory.cu:497
mgpu::ofp_context_t & getmgpuContext(bool iw=true)
If nvidia cuda is activated return a mgpu context.
virtual void deviceToHost()
Move memory from device to host.
Definition: CudaMemory.cu:367
size_t size()
Get the total number of processors.
aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202