OpenFPM  5.2.0
Project that contains the implementation of distributed structures
vector_dist_cuda_funcs.cuh
/*
 * vector_dist_cuda_funcs.cuh
 *
 *  Created on: Aug 14, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_

#include "Vector/util/vector_dist_funcs.hpp"
#include "Decomposition/common.hpp"
#include "lib/pdata.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/reduce_ofp.cuh"
#include "memory/CudaMemory.cuh"
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec, vector_type vd, start_type starts, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int base = starts.template get<0>(p);

    dec.ghost_processor_ID(xp,out,base,p);
}

template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename output_type>
__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    out.template get<0>(p) = dec.ghost_processorID_N(xp);
}

template<unsigned int dim, typename St, typename particles_type>
__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    applyPointBC_no_dec(domain,bc,parts.get(p));
}

template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz, int rank)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    cdg.applyPointBC(parts.get(p));
    Point<dim,St> xp = parts.template get<0>(p);

    int pr = cdg.processorID(xp);

#ifndef TEST1
    output.template get<1>(p) = (pr == rank)?-1:pr;
    output.template get<0>(p) = p;
#else
    output.template get<1>(p) = pr;
    int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
    output.template get<2>(p) = nl;
#endif
}


template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
         typename vector_pos_type_in, typename vector_prp_type_in>
__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
                                      vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
}

template<typename vector_g_opart_type, typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
                                            vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
}


template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
                                                vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,m_prp,v_prp);
}

template<unsigned int dim, typename vector_g_opart_type, typename vector_pos_type_out, typename vector_pos_type_in, typename vector_shift_type_in>
__global__ void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
                                            vector_pos_type_in v_pos, vector_shift_type_in shifts, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    // each entry packs two indices into one 64-bit word: the lower 32 bits
    // hold the source particle id, the upper 32 bits the shift-vector index
    unsigned long int psid = g_opart.template get<1>(i+offset);

    unsigned int id = psid & 0xFFFFFFFF;
    unsigned int shift_id = psid >> 32;

    #pragma unroll
    for (int j = 0; j < dim ; j++)
    {
        m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];
    }
}

template<bool with_pos, unsigned int dim, typename vector_g_opart_type, typename vector_pos_type,
         typename vector_prp_type, typename vector_shift_type_in>
__global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
                                              vector_shift_type_in shifts, unsigned int base)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= g_opart.size()) return;

    unsigned int pid = g_opart.template get<0>(i);
    unsigned int shift_id = g_opart.template get<1>(i);

    if (with_pos == true)
    {
        #pragma unroll
        for (int j = 0; j < dim ; j++)
        {
            v_pos.template get<0>(base+i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];
        }
    }

    v_prp.set(base+i,v_prp.get(pid));
}

template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int ghostMarker)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= ghostMarker) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);

        // comparing against the shift of the previous match avoids counting a
        // particle twice for the same shift; this appears to assume that boxes
        // sharing the same shift are stored consecutively in box_f
        bool sw = (old_shift == shift_actual);

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            old_shift = shift_actual;
            n++;
        }
    }

    out.template get<0>(p) = n;
}
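
// Note: num_shift_ghost_each_part produces, per particle, the number of ghost
// replicas it generates; an exclusive scan of this output is what typically
// provides the `start` offsets consumed by shift_ghost_each_part below
// (hedged: the scan itself is performed by the caller, outside this header,
// e.g. via openfpm::scan).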

template<unsigned int dim, typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type_pos,
         typename vector_type_prp,
         typename start_type,
         typename shifts_type,
         typename output_type>
__global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                      vector_type_pos v_pos, vector_type_prp v_prp,
                                      start_type start, shifts_type shifts,
                                      output_type output, unsigned int offset, unsigned int ghostMarker)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= ghostMarker) return;

    Point<dim,St> xp = v_pos.template get<0>(p);

    unsigned int base_o = start.template get<0>(p);
    unsigned int base = base_o + offset;

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual);

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            #pragma unroll
            for (unsigned int j = 0 ; j < dim ; j++)
            {
                v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
            }

            output.template get<0>(base_o+n) = p;
            output.template get<1>(base_o+n) = shift_actual;

            v_prp.set(base+n,v_prp.get(p));

            old_shift = shift_actual;
            n++;
        }
    }
}

template<typename vector_lbl_type, typename starts_type>
__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_opart.size()) return;

    int pr = m_opart.template get<1>(i);

    m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
}

template<typename red_type>
struct _add_: gpu::plus_t<red_type>
{};

template<typename red_type>
struct _max_: gpu::maximum_t<red_type>
{};

template<unsigned int prp, template <typename> class op, typename vector_type>
auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
    typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;

    CudaMemory mem;
    mem.allocate(sizeof(reduce_type));

    openfpm::reduce((reduce_type *)vd.getPropVector().template getDeviceBuffer<prp>(),
                    vd.size_local(), (reduce_type *)mem.getDevicePointer(),
                    op<reduce_type>(), vd.getVC().getGpuContext());

    mem.deviceToHost();

    return *(reduce_type *)(mem.getPointer());
}
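
// Usage sketch (hypothetical caller, not part of this header): for a
// distributed vector `vd` whose property 0 is a scalar, the sum and the
// maximum of that property over the local particles would read
//
//   auto sum = reduce_local<0,_add_>(vd);
//   auto max = reduce_local<0,_max_>(vd);
//
// The reduction is local to the processor; combining the per-processor
// results is left to the caller.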

template<typename vector_type>
__global__ void create_index(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size()) return;

    vd.template get<0>(i) = i;
}

template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd_prp_dst.size()) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}

    vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
}

template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= scan.size()) return;

    // the scan value is the compacted destination index of particle i
    auto sc = scan.template get<0>(i);

    // a zero (flipped) mark means the particle was marked for removal
    if (vd_prp_src.template get<prp>(i) == 0) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}

    vd_prp_dst.set(sc,vd_prp_src,i);
}


template<unsigned int prp,typename vector_type>
__global__ void flip_one_to_zero(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size_local()) return;

    // turn removal marks into survival flags: marked (1) -> 0, unmarked (0) -> 1
    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
}

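/*! \brief Remove the particles marked in property prp
 *
 * The marking property must hold 1 for the particles to remove and 0 for the
 * particles to keep. The removal works in three steps: flip the marks
 * (flip_one_to_zero), exclusive-scan the flipped marks to get the compacted
 * destination index of every surviving particle, and scatter positions and
 * properties into freshly sized vectors (copy_new_to_old_by_scan).
 *
 * \param vd distributed vector
 * \param n block size used to iterate over the particles
 */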
template<unsigned int prp, typename vector_type>
void remove_marked(vector_type & vd, const int n = 1024)
{
    // This function makes sense only if prp is an arithmetic scalar type
    if (std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, unsigned int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, float >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, double >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, size_t >::value == false)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " error, remove_marked works only if the marking property is an int, unsigned int, float, double or size_t" << std::endl;
        return;
    }

    if (vd.size_local() == 0)
    {return;}

    typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;

    // Because the particles to remove are marked with one, flip the ones to
    // zero and the zeros to one

    auto ite = vd.getDomainIteratorGPU(n);

    CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());

    // first we scan

    // scan output buffer (element type inferred from the casts below)
    openfpm::vector_gpu<aggregate<remove_type>> idx;

    if (mem_tmp.ref() == 0)
    {mem_tmp.incRef();}

    idx.setMemory(mem_tmp);
    idx.resize(vd.size_local());

    openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getGpuContext());

    // Check if we marked something

    idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
    vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);

    // the exclusive scan of the flipped marks plus the last flipped mark gives
    // the number of survivors, so the difference to size_local() is the number
    // of marked particles
    int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));

    if (n_marked == 0)
    {
        // No particle has been marked, nothing to do

        return;
    }

    typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
    typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;

    // We reuse memory: this avoids a re-allocation and makes the removal faster

    // Reference counter must be set to zero

/*  for (int i = 0 ; i < MAX_NUMER_OF_PROPERTIES ; i++)
    {
        for (int j = 0 ; j < exp_tmp2[i].ref() ; j++)
        {exp_tmp2[i].decRef();}
    }

    for (int j = 0 ; j < exp_tmp.ref() ; j++)
    {exp_tmp.decRef();}*/

    vd_pos_new.setMemory(exp_tmp);
    vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);

    // resize them

    vd_pos_new.resize(vd.size_local() - n_marked);
    vd_prp_new.resize(vd.size_local() - n_marked);

    auto & vd_pos_old = vd.getPosVector();
    auto & vd_prp_old = vd.getPropVector();

    CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());

    vd.setGhostMarker(vd_pos_new.size());

    vd.getPosVector().swap_nomode(vd_pos_new);
    vd.getPropVector().swap_nomode(vd_prp_new);

    // Increment v_pos_new and vd_prp_new memory counters

    vd.setReferenceCounterToOne();
}
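
// Usage sketch (hypothetical property layout, not part of this header): with
// particles carrying an int flag as property 2, marking and removing would be
//
//   // ... set vd.getProp<2>(p) to 1 on the particles to delete ...
//   vd.template hostToDeviceProp<2>();
//   remove_marked<2>(vd);
//
// after which only the unmarked particles remain in the local vector.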

template<unsigned int prp, typename functor, typename particles_type, typename out_type>
__global__ void mark_indexes(particles_type vd, out_type out, unsigned int ghostMarker)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) {return;}

    out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < ghostMarker;
}

template<typename out_type, typename ids_type>
__global__ void fill_indexes(out_type scan, ids_type ids)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= scan.size()-1) {return;}

    auto sp = scan.template get<0>(p);
    auto spp = scan.template get<0>(p+1);

    // a step in the scan means particle p was marked: its scan value is its
    // compacted output position
    if (sp != spp)
    {ids.template get<0>(scan.template get<0>(p)) = p;}
}

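/*! \brief Collect the indexes of the particles whose property prp satisfies functor::check
 *
 * Stream compaction in three steps: mark_indexes writes a 0/1 flag per
 * particle, an exclusive scan turns the flags into output positions, and
 * fill_indexes scatters the indexes of the matching particles into ids.
 *
 * \param vd vector of particles
 * \param ids output vector of indexes
 * \param end consider only particles with index below end (the ghost marker)
 * \param gpuContext gpu context
 */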
template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end, gpu::ofp_context_t & gpuContext)
{
    // first we do a scan of the property

    // flag/scan buffer (element type inferred from the casts below)
    openfpm::vector_gpu<aggregate<unsigned int>> scan;

    scan.setMemory(mem_tmp);
    scan.resize(vd.size()+1);

    auto ite = scan.getGPUIterator();

    CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),(unsigned int)end);

    openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),gpuContext);

    // get the number of marked particles
    scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
    size_t nf = scan.template get<0>(scan.size()-1);
    ids.resize(nf);

    CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
}
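
// Usage sketch (hypothetical functor and buffers, not part of this header):
// a predicate selecting the particles whose property 1 is non-zero could be
//
//   struct is_nonzero
//   {
//       template<typename val_type>
//       __device__ static bool check(val_type v) {return v != 0;}
//   };
//
//   openfpm::vector_gpu<aggregate<unsigned int>> ids;
//   get_indexes_by_type<1,is_nonzero>(prop_vector,ids,ghostMarker,gpuContext);
//
// which collects into ids every index below ghostMarker whose property 1 is
// non-zero.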

#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */