OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
vector_dist_cuda_funcs.cuh
/*
 * vector_dist_cuda_funcs.cuh
 *
 *  Created on: Aug 14, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_

#include "Vector/util/vector_dist_funcs.hpp"
#include "Decomposition/common.hpp"
#include "lib/pdata.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/reduce_ofp.cuh"
#include "memory/CudaMemory.cuh"

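/*! \brief For each particle, write the ids of the processors for which it is a ghost
 *
 * One thread per particle: the decomposition is queried with the particle position and the
 * resulting processor ids are written into out starting at the per-particle offset in starts.
 */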
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec, vector_type vd, start_type starts, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int base = starts.template get<0>(p);

    dec.ghost_processor_ID(xp,out,base,p);
}

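/*! \brief For each particle, count the number of processors for which it is a ghost
 *
 * One thread per particle: the count returned by the decomposition is stored in out.
 */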
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename output_type>
__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    out.template get<0>(p) = dec.ghost_processorID_N(xp);
}

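/*! \brief Apply the boundary conditions of the domain to every particle position
 */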
template<unsigned int dim, typename St, typename particles_type>
__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    applyPointBC_no_dec(domain,bc,parts.get(p));
}

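/*! \brief Merge the re-ordered particles back, for the selected properties only
 *
 * One thread per particle: the position (only when merge_pos is true) and the properties
 * prp ... are copied from the ordered vectors into vd_pos / vd_prp following the
 * permutation stored in nss.
 */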
template<bool merge_pos, typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
                                vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                                stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd_pos.size()) return;

    if (merge_pos == true)
    {
        vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
    }

    vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
}

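/*! \brief Merge the re-ordered particles back, position and all properties
 */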
template<typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_all(vector_pos_type vd_pos, vector_prp_type vd_prp,
                               vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                               stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd_pos.size()) return;

    vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));

    vd_prp.set(p,vd_prp_ord,nss.template get<0>(p));
}

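/*! \brief Label each particle with the processor that owns it
 *
 * One thread per particle: the boundary conditions are applied and the owning processor is
 * computed from the decomposition; particles that stay on this rank are labelled with -1
 * (the TEST1 variant additionally counts, with atomicAdd, how many particles go to each processor).
 */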
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz, int rank)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    cdg.applyPointBC(parts.get(p));
    Point<dim,St> xp = parts.template get<0>(p);

    int pr = cdg.processorID(xp);

#ifndef TEST1
    output.template get<1>(p) = (pr == rank)?-1:pr;
    output.template get<0>(p) = p;
#else
    output.template get<1>(p) = pr;
    int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
    output.template get<2>(p) = nl;
#endif
}

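/*! \brief Fill the send buffers with the particles to redistribute
 *
 * One thread per buffer element: the particles labelled in m_opart are copied from
 * v_pos / v_prp into the output buffers m_pos / m_prp.
 */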
template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
         typename vector_pos_type_in, typename vector_prp_type_in>
__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
                                      vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
}

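/*! \brief Fill the property send buffer for the ghost particles
 *
 * One thread per buffer element: the properties prp ... of the particles listed in g_opart
 * are copied from v_prp into m_prp.
 */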
template<typename vector_g_opart_type, typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
                                            vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
}

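/*! \brief Variant of process_ghost_particles_prp that addresses the particles with a plain
 *         offset instead of the indirection list g_opart
 */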
template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
                                                vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,m_prp,v_prp);
}

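/*! \brief Fill the position send buffer for the ghost particles
 *
 * One thread per buffer element: each entry of g_opart packs the particle id in the lower
 * 32 bits and the shift id in the upper 32 bits; the shifted position
 * v_pos(id) - shifts(shift_id) is written into m_pos.
 */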
template<unsigned int dim, typename vector_g_opart_type, typename vector_pos_type_out, typename vector_pos_type_in, typename vector_shift_type_in>
__global__ void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
                                            vector_pos_type_in v_pos, vector_shift_type_in shifts, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    unsigned long int psid = g_opart.template get<1>(i+offset);

    unsigned int id = psid & 0xFFFFFFFF;
    unsigned int shift_id = psid >> 32;

    #pragma unroll
    for (int j = 0; j < dim ; j++)
    {
        m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];
    }
}

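/*! \brief Append the local (same processor) ghost particles at the end of v_pos / v_prp
 *
 * One thread per ghost particle: the source particle pid is replicated at index base + i,
 * shifting its position (when with_pos is true) and copying all its properties.
 */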
template<bool with_pos, unsigned int dim, typename vector_g_opart_type, typename vector_pos_type,
         typename vector_prp_type, typename vector_shift_type_in>
__global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
                                              vector_shift_type_in shifts, unsigned int base)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= g_opart.size()) return;

    unsigned int pid = g_opart.template get<0>(i);
    unsigned int shift_id = g_opart.template get<1>(i);

    if (with_pos == true)
    {
        #pragma unroll
        for (int j = 0; j < dim ; j++)
        {
            v_pos.template get<0>(base+i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];
        }
    }

    v_prp.set(base+i,v_prp.get(pid));
}

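/*! \brief Count how many shifted (local ghost) copies each particle generates
 *
 * One thread per domain particle (p < g_m): the particle is tested against every box in
 * box_f; consecutive boxes carrying the same shift id (box_f_sv) are counted only once.
 */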
template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int g_m)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= g_m) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            old_shift = shift_actual;
            n++;
        }
    }

    out.template get<0>(p) = n;
}

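/*! \brief Create the shifted (local ghost) copies counted by num_shift_ghost_each_part
 *
 * One thread per domain particle: each copy is written into v_pos / v_prp at position
 * start(p) + offset + n, while output records, for every copy, the source particle and the
 * shift that generated it.
 */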
template<unsigned int dim, typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type_pos,
         typename vector_type_prp,
         typename start_type,
         typename shifts_type,
         typename output_type>
__global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                      vector_type_pos v_pos, vector_type_prp v_prp,
                                      start_type start, shifts_type shifts,
                                      output_type output, unsigned int offset, unsigned int g_m)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= g_m) return;

    Point<dim,St> xp = v_pos.template get<0>(p);

    unsigned int base_o = start.template get<0>(p);
    unsigned int base = base_o + offset;

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
#pragma unroll
            for (unsigned int j = 0 ; j < dim ; j++)
            {
                v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
            }

            output.template get<0>(base_o+n) = p;
            output.template get<1>(base_o+n) = shift_actual;

            v_prp.set(base+n,v_prp.get(p));

            old_shift = shift_actual;
            n++;
        }
    }
}

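/*! \brief Reorder the labelled particles by destination processor
 *
 * One thread per labelled particle: the particle index is scattered into m_opart<0> at the
 * position given by the start offset of its destination processor plus its per-processor slot.
 */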
template<typename vector_lbl_type, typename starts_type>
__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_opart.size()) return;

    int pr = m_opart.template get<1>(i);

    m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
}

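/*! \brief Reduction operators (sum and maximum) usable with reduce_local */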
template<typename red_type>
struct _add_: mgpu::plus_t<red_type>
{};

template<typename red_type>
struct _max_: mgpu::maximum_t<red_type>
{};

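/*! \brief Reduce on the device the property prp of all the local particles with the operator op
 *         and return the result on the host
 *
 * A minimal usage sketch (hypothetical particle set vd whose scalar property 0 is a float
 * and is up to date on the device):
 *
 *    float vmax = reduce_local<0,_max_>(vd);
 */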
template<unsigned int prp, template <typename> class op, typename vector_type>
auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
    typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;

    CudaMemory mem;
    mem.allocate(sizeof(reduce_type));

    openfpm::reduce((reduce_type *)vd.getPropVector().template getDeviceBuffer<prp>(),
                    vd.size_local(), (reduce_type *)mem.getDevicePointer(),
                    op<reduce_type>(), vd.getVC().getmgpuContext());

    mem.deviceToHost();

    return *(reduce_type *)(mem.getPointer());
}

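/*! \brief Fill property 0 of every element with its own index (0,1,2,...) */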
template<typename vector_type>
__global__ void create_index(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size()) return;

    vd.template get<0>(i) = i;
}

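/*! \brief Gather particles: destination i takes the position and all the properties of the
 *         source particle idx(i)
 */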
template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd_prp_dst.size()) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}

    vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
}

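/*! \brief Compact the particles whose flag property prp is non-zero
 *
 * One thread per source particle: the particle is copied (position and all properties) to the
 * destination index given by the scan of the flags.
 */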
template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= scan.size()) return;

    auto sc = scan.template get<0>(i);

    if (vd_prp_src.template get<prp>(i) == 0) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}

    vd_prp_dst.set(sc,vd_prp_src,i);
}

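/*! \brief Logically negate the marker property prp of every local particle
 *         (non-zero becomes 0, zero becomes 1)
 */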
template<unsigned int prp,typename vector_type>
__global__ void flip_one_to_zero(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size_local()) return;

    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
}

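/*! \brief Remove from vd all the local particles whose marker property prp is set (non-zero)
 *
 * The markers are flipped so that the survivors hold 1, an exclusive scan gives every survivor
 * its compacted index, and copy_new_to_old_by_scan packs the survivors into temporary buffers
 * that are finally swapped back into vd; n is the GPU block size used to launch the kernels.
 */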
template<unsigned int prp, typename vector_type>
void remove_marked(vector_type & vd, const int n = 1024)
{
    // This function makes sense only if prp is an int, unsigned int, float, double or size_t
    if (std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, unsigned int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, float >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, double >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, size_t >::value == false)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " error, remove_marked works only if the marker property is an int, unsigned int, float, double or size_t" << std::endl;
        return;
    }

    if (vd.size_local() == 0)
    {return;}

    typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;

    // because we mark the ones to remove, we flip the ones to zero and the zeros to one

    auto ite = vd.getDomainIteratorGPU(n);

    CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());

    // first we scan

    openfpm::vector_gpu<aggregate<remove_type>> idx;

    if (mem_tmp.ref() == 0)
    {mem_tmp.incRef();}

    idx.setMemory(mem_tmp);
    idx.resize(vd.size_local());

    openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getmgpuContext());

    // Check if we marked something

    idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
    vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);

    int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));

    if (n_marked == 0)
    {
        // No particle has been marked, nothing to do

        return;
    }

    typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
    typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;

    // we reuse memory; this avoids a new allocation and makes the removal faster

    // Reference counter must be set to zero

/*  for (int i = 0 ; i < MAX_NUMER_OF_PROPERTIES ; i++)
    {
        for (int j = 0 ; j < exp_tmp2[i].ref() ; j++)
        {exp_tmp2[i].decRef();}
    }

    for (int j = 0 ; j < exp_tmp.ref() ; j++)
    {exp_tmp.decRef();}*/

    vd_pos_new.setMemory(exp_tmp);
    vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);

    // resize them

    vd_pos_new.resize(vd.size_local() - n_marked);
    vd_prp_new.resize(vd.size_local() - n_marked);

    auto & vd_pos_old = vd.getPosVector();
    auto & vd_prp_old = vd.getPropVector();

    CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());

    vd.set_g_m(vd_pos_new.size());

    vd.getPosVector().swap_nomode(vd_pos_new);
    vd.getPropVector().swap_nomode(vd_prp_new);

    // Increment v_pos_new and vd_prp_new memory counters

    vd.setReferenceCounterToOne();
}

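/*! \brief Write 1 into out for every particle below g_m whose property prp satisfies
 *         functor::check, 0 otherwise
 */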
template<unsigned int prp, typename functor, typename particles_type, typename out_type>
__global__ void mark_indexes(particles_type vd, out_type out, unsigned int g_m)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) {return;}

    out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < g_m;
}

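/*! \brief Compact the marked indexes: where the scan value increases from p to p+1, the
 *         particle index p is stored into ids at position scan(p)
 */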
template<typename out_type, typename ids_type>
__global__ void fill_indexes(out_type scan, ids_type ids)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= scan.size()-1) {return;}

    auto sp = scan.template get<0>(p);
    auto spp = scan.template get<0>(p+1);

    if (sp != spp)
    {ids.template get<0>(scan.template get<0>(p)) = p;}
}

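/*! \brief Collect into ids the indexes of the particles (up to end) whose property prp
 *         satisfies functor::check
 *
 * Implemented with mark_indexes, a device scan and fill_indexes; the temporary scan buffer
 * reuses the mem_tmp memory.
 */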
template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end, mgpu::ofp_context_t & context)
{
    // first we do a scan of the property

    openfpm::vector_gpu<aggregate<unsigned int>> scan;

    scan.setMemory(mem_tmp);
    scan.resize(vd.size()+1);

    auto ite = scan.getGPUIterator();

    CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),end);

    openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),context);

    // get the number of marked particles
    scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
    size_t nf = scan.template get<0>(scan.size()-1);
    ids.resize(nf);

    CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
}

#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */