OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
vector_dist_cuda_funcs.cuh
1/*
2 * vector_dist_cuda_funcs.cuh
3 *
4 * Created on: Aug 14, 2018
5 * Author: i-bird
6 */
7
8#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
9#define VECTOR_DIST_CUDA_FUNCS_CUH_
10
11#include "Vector/util/vector_dist_funcs.hpp"
12#include "Decomposition/common.hpp"
13#include "lib/pdata.hpp"
14#include "util/cuda/kernels.cuh"
15#include "util/cuda/scan_ofp.cuh"
16#include "util/cuda/reduce_ofp.cuh"
17#include "memory/CudaMemory.cuh"
18
19template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
20__global__ void proc_label_id_ghost(decomposition_type dec,vector_type vd, start_type starts, output_type out)
21{
22 int p = threadIdx.x + blockIdx.x * blockDim.x;
23
24 if (p >= vd.size()) return;
25
26 Point<dim,St> xp = vd.template get<0>(p);
27
28 unsigned int base = starts.template get<0>(p);
29
30 dec.ghost_processor_ID(xp,out,base,p);
31}
32
33template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename output_type>
34__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd, output_type out)
35{
36 int p = threadIdx.x + blockIdx.x * blockDim.x;
37
38 if (p >= vd.size()) return;
39
40 Point<dim,St> xp = vd.template get<0>(p);
41
42 out.template get<0>(p) = dec.ghost_processorID_N(xp);
43}
44
45template<unsigned int dim, typename St, typename particles_type>
46__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
47{
48 int p = threadIdx.x + blockIdx.x * blockDim.x;
49
50 if (p >= parts.size()) return;
51
52 applyPointBC_no_dec(domain,bc,parts.get(p));
53}
54
55template<bool merge_pos, typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
56__global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
57 vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
58 stns_type nss)
59{
60 int p = threadIdx.x + blockIdx.x * blockDim.x;
61
62 if (p >= vd_pos.size()) return;
63
64 if (merge_pos == true)
65 {
66 vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
67 }
68
69 vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
70}
71
72template<typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
73__global__ void merge_sort_all(vector_pos_type vd_pos, vector_prp_type vd_prp,
74 vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
75 stns_type nss)
76{
77 int p = threadIdx.x + blockIdx.x * blockDim.x;
78
79 if (p >= vd_pos.size()) return;
80
81 vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
82
83 vd_prp.set(p,vd_prp_ord,nss.template get<0>(p));
84}
85
86template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
87__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz , int rank)
88{
89 int p = threadIdx.x + blockIdx.x * blockDim.x;
90
91 if (p >= parts.size()) return;
92
93 cdg.applyPointBC(parts.get(p));
94 Point<dim,St> xp = parts.template get<0>(p);
95
96 int pr = cdg.processorID(xp);
97
98#ifndef TEST1
99 output.template get<1>(p) = (pr == rank)?-1:pr;
100 output.template get<0>(p) = p;
101#else
102 output.template get<1>(p) = pr;
103 int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
104 output.template get<2>(p) = nl;
105#endif
106}
107
108
109template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
110 typename vector_pos_type_in, typename vector_prp_type_in>
111__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
112 vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
113{
114 int i = threadIdx.x + blockIdx.x * blockDim.x;
115
116 if (i >= m_pos.size()) return;
117
118 process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
119}
120
121template<typename vector_g_opart_type, typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
122__global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
123 vector_prp_type_in v_prp, unsigned int offset)
124{
125 int i = threadIdx.x + blockIdx.x * blockDim.x;
126
127 if (i >= m_prp.size()) return;
128
129 process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
130}
131
132
133template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
134__global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
135 vector_prp_type_in v_prp, unsigned int offset)
136{
137 int i = threadIdx.x + blockIdx.x * blockDim.x;
138
139 if (i >= m_prp.size()) return;
140
141 process_ghost_device_particle_prp<vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,m_prp,v_prp);
142}
143
144template<unsigned int dim, typename vector_g_opart_type, typename vector_pos_type_out, typename vector_pos_type_in, typename vector_shift_type_in>
145__global__ void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
146 vector_pos_type_in v_pos, vector_shift_type_in shifts, unsigned int offset)
147{
148 int i = threadIdx.x + blockIdx.x * blockDim.x;
149
150 if (i >= m_pos.size()) return;
151
152 unsigned long int psid = g_opart.template get<1>(i+offset);
153
154 unsigned int id = psid & 0xFFFFFFFF;
155 unsigned int shift_id = psid >> 32;
156
157 #pragma unroll
158 for (int j = 0; j < dim ; j++)
159 {
160 m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];
161 }
162}
163
164template<bool with_pos, unsigned int dim, typename vector_g_opart_type, typename vector_pos_type,
165 typename vector_prp_type, typename vector_shift_type_in>
166__global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
167 vector_shift_type_in shifts, unsigned int base)
168{
169 int i = threadIdx.x + blockIdx.x * blockDim.x;
170
171 if (i >= g_opart.size()) return;
172
173 unsigned int pid = g_opart.template get<0>(i);
174 unsigned int shift_id = g_opart.template get<1>(i);
175
176 if (with_pos == true)
177 {
178 #pragma unroll
179 for (int j = 0; j < dim ; j++)
180 {
181 v_pos.template get<0>(base+i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];
182 }
183 }
184
185 v_prp.set(base+i,v_prp.get(pid));
186}
187
188template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type>
189__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int g_m)
190{
191 unsigned int old_shift = (unsigned int)-1;
192 int p = threadIdx.x + blockIdx.x * blockDim.x;
193
194 if (p >= g_m) return;
195
196 Point<dim,St> xp = vd.template get<0>(p);
197
198 unsigned int n = 0;
199
200 for (unsigned int i = 0 ; i < box_f.size() ; i++)
201 {
202 unsigned int shift_actual = box_f_sv.template get<0>(i);
203 bool sw = (old_shift == shift_actual)?true:false;
204
205 if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
206 {
207 old_shift = shift_actual;
208 n++;
209 }
210 }
211
212 out.template get<0>(p) = n;
213}
214
215template<unsigned int dim, typename St,
216 typename vector_of_box,
217 typename vector_of_shifts,
218 typename vector_type_pos,
219 typename vector_type_prp,
220 typename start_type,
221 typename shifts_type,
222 typename output_type>
223__global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
224 vector_type_pos v_pos, vector_type_prp v_prp,
225 start_type start, shifts_type shifts,
226 output_type output, unsigned int offset,unsigned int g_m)
227{
228 unsigned int old_shift = (unsigned int)-1;
229 int p = threadIdx.x + blockIdx.x * blockDim.x;
230
231 if (p >= g_m) return;
232
233 Point<dim,St> xp = v_pos.template get<0>(p);
234
235 unsigned int base_o = start.template get<0>(p);
236 unsigned int base = base_o + offset;
237
238
239 unsigned int n = 0;
240
241 for (unsigned int i = 0 ; i < box_f.size() ; i++)
242 {
243 unsigned int shift_actual = box_f_sv.template get<0>(i);
244 bool sw = (old_shift == shift_actual)?true:false;
245
246 if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
247 {
248
249#pragma unroll
250 for (unsigned int j = 0 ; j < dim ; j++)
251 {
252 v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
253 }
254
255 output.template get<0>(base_o+n) = p;
256 output.template get<1>(base_o+n) = shift_actual;
257
258 v_prp.set(base+n,v_prp.get(p));
259
260 old_shift = shift_actual;
261 n++;
262 }
263 }
264}
265
266template<typename vector_lbl_type, typename starts_type>
267__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
268{
269 int i = threadIdx.x + blockIdx.x * blockDim.x;
270
271 if (i >= m_opart.size()) return;
272
273 int pr = m_opart.template get<1>(i);
274
275 m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
276}
277
278template<typename red_type>
279struct _add_: gpu::plus_t<red_type>
280{};
281
282template<typename red_type>
283struct _max_: gpu::maximum_t<red_type>
284{};
285
286template<unsigned int prp, template <typename> class op, typename vector_type>
287auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
288{
289 typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;
290
291 CudaMemory mem;
292 mem.allocate(sizeof(reduce_type));
293
294 openfpm::reduce((reduce_type *)vd.getPropVector(). template getDeviceBuffer<prp>(),
295 vd.size_local(), (reduce_type *)mem.getDevicePointer() ,
296 op<reduce_type>(), vd.getVC().getgpuContext());
297
298 mem.deviceToHost();
299
300 return *(reduce_type *)(mem.getPointer());
301}
302
303template<typename vector_type>
304__global__ void create_index(vector_type vd)
305{
306 int i = threadIdx.x + blockIdx.x * blockDim.x;
307
308 if (i >= vd.size()) return;
309
310 vd.template get<0>(i) = i;
311}
312
313template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
314__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
315{
316 int i = threadIdx.x + blockIdx.x * blockDim.x;
317
318 if (i >= vd_prp_dst.size()) return;
319
320 for (unsigned int k = 0 ; k < dim ; k++)
321 {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}
322
323 vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
324}
325
326template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
327__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
328{
329 int i = threadIdx.x + blockIdx.x * blockDim.x;
330
331 if (i >= scan.size()) return;
332
333 auto sc = scan.template get<0>(i);
334
335 if (vd_prp_src.template get<prp>(i) == 0) return;
336
337 for (unsigned int k = 0 ; k < dim ; k++)
338 {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}
339
340 vd_prp_dst.set(sc,vd_prp_src,i);
341}
342
343
344template<unsigned int prp,typename vector_type>
345__global__ void flip_one_to_zero(vector_type vd)
346{
347 int i = threadIdx.x + blockIdx.x * blockDim.x;
348
349 if (i >= vd.size_local()) return;
350
351 vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
352}
353
354
364template<unsigned int prp, typename vector_type>
365void remove_marked(vector_type & vd, const int n = 1024)
366{
367 // This function make sense only if prp is an int or unsigned int
368 if (std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, int >::value == false &&
369 std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, unsigned int >::value == false &&
370 std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, float >::value == false &&
371 std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, double >::value == false &&
372 std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, size_t >::value == false)
373 {
374 std::cout << __FILE__ << ":" << __LINE__ << " error, the function remove_marked work only if is an integer or unsigned int" << std::endl;
375 return;
376 }
377
378 if (vd.size_local() == 0)
379 {return;}
380
381 typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;
382
383 // because we mark the one to remove we flip the one to zero and the zeros to one
384
385 auto ite = vd.getDomainIteratorGPU(n);
386
387 CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());
388
389 // first we scan
390
392
393 if (mem_tmp.ref() == 0)
394 {mem_tmp.incRef();}
395
396 idx.setMemory(mem_tmp);
397 idx.resize(vd.size_local());
398
399 openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getgpuContext());
400
401 // Check if we marked something
402
403 idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
404 vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);
405
406 int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));
407
408 if (n_marked == 0)
409 {
410 // No particle has been marked Nothing to do
411
412 return;
413 }
414
416
417 typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
418 typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;
419
420 // we reuse memory. this give us the possibility to avoid allocation and make the remove faster
421
422 // Reference counter must be set to zero
423
424/* for (int i = 0 ; i < MAX_NUMER_OF_PROPERTIES ; i++)
425 {
426 for (int j = 0 ; j < exp_tmp2[i].ref() ; j++)
427 {exp_tmp2[i].decRef();}
428 }
429
430 for (int j = 0 ; j < exp_tmp.ref() ; j++)
431 {exp_tmp.decRef();}*/
432
433 //vd_pos_new.setMemory(exp_tmp);
434 //vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);
435
436 // resize them
437
438 vd_pos_new.resize(vd.size_local() - n_marked);
439 vd_prp_new.resize(vd.size_local() - n_marked);
440
441 auto & vd_pos_old = vd.getPosVector();
442 auto & vd_prp_old = vd.getPropVector();
443
444 CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
445
446 vd.set_g_m(vd_pos_new.size());
447
448 vd.getPosVector().swap_nomode(vd_pos_new);
449 vd.getPropVector().swap_nomode(vd_prp_new);
450
451 // Increment v_pos_new and vd_prp_new memory counters
452
454}
455
456template<unsigned int prp, typename functor, typename particles_type, typename out_type>
457__global__ void mark_indexes(particles_type vd, out_type out, unsigned int g_m)
458{
459 unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
460
461 if (p >= vd.size()) {return;}
462
463 out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < g_m;
464}
465
466template<typename out_type, typename ids_type>
467__global__ void fill_indexes(out_type scan, ids_type ids)
468{
469 unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
470
471 if (p >= scan.size()-1) {return;}
472
473 auto sp = scan.template get<0>(p);
474 auto spp = scan.template get<0>(p+1);
475
476 if (sp != spp)
477 {ids.template get<0>(scan.template get<0>(p)) = p;}
478}
479
491template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
492void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end ,gpu::ofp_context_t & context)
493{
494 // first we do a scan of the property
496
497 scan.setMemory(mem_tmp);
498 scan.resize(vd.size()+1);
499
500 auto ite = scan.getGPUIterator();
501
502 CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),end);
503
504 openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),context);
505
506 // get the number of marked particles
507 scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
508 size_t nf = scan.template get<0>(scan.size()-1);
509 ids.resize(nf);
510
511 CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
512}
513
514#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */
This class represent an N-dimensional box.
Definition Box.hpp:61
__device__ __host__ bool isInsideNP(const Point< dim, T > &p) const
Check if the point is inside the region excluding the positive part.
Definition Box.hpp:1034
virtual bool allocate(size_t sz)
allocate memory
Definition CudaMemory.cu:38
This class implement the point shape in an N-dimensional space.
Definition Point.hpp:28
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Definition Point.hpp:172
Implementation of 1-D std::vector like structure.
size_t size()
Stub size.
Distributed vector.
size_t size_local() const
return the local size of the vector
const vector_dist_prop & getPropVector() const
return the property vector of all the particles
void setReferenceCounterToOne()
const vector_dist_pos & getPosVector() const
return the position vector of all the particles
Vcluster< Memory > & getVC()
Get the Virtual Cluster machine.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
Boundary conditions.
Definition common.hpp:31
In general a reduction of a type T produce a type T.