#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_

#include <iostream>
#include <type_traits>

#include "Vector/util/vector_dist_funcs.hpp"
#include "Decomposition/common.hpp"
#include "lib/pdata.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/reduce_ofp.cuh"
#include "memory/CudaMemory.cuh"
/*! \brief Label each particle with the processor ids of the ghosts it generates
 *
 * One thread per particle. Writes the ghost processor labels for particle p
 * into `out` starting at offset `starts[p]` (an exclusive scan computed by
 * num_proc_ghost_each_part).
 *
 * \param dec decomposition object, provides ghost_processor_ID
 * \param vd particle position vector
 * \param starts per-particle output offsets (property 0)
 * \param out output label buffer
 */
template<unsigned int dim,
         typename St,
         typename decomposition_type,
         typename vector_type,
         typename start_type,
         typename output_type>
__global__
void proc_label_id_ghost(decomposition_type dec,
                         vector_type vd, start_type starts, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard: the grid may be larger than the particle set
    if (p >= vd.size())
        return;

    // particle position
    // NOTE(review): this declaration was dropped by extraction (xp was used
    // below without a visible definition) — reconstructed, confirm upstream
    Point<dim, St> xp = vd.template get<0>(p);

    // slot in `out` where this particle's labels begin
    unsigned int base = starts.template get<0>(p);

    dec.ghost_processor_ID(xp, out, base, p);
}
/*! \brief Count, per particle, how many processors must receive it as a ghost
 *
 * One thread per particle. The counts are typically scanned afterwards to
 * build the `starts` offsets consumed by proc_label_id_ghost.
 *
 * \param dec decomposition object, provides ghost_processorID_N
 * \param vd particle position vector
 * \param out per-particle count (property 0)
 */
template<unsigned int dim,
         typename St,
         typename decomposition_type,
         typename vector_type,
         typename output_type>
__global__
void num_proc_ghost_each_part(decomposition_type dec,
                              vector_type vd, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= vd.size())
        return;

    // particle position
    // NOTE(review): declaration reconstructed — the extraction dropped it
    Point<dim, St> xp = vd.template get<0>(p);

    out.template get<0>(p) = dec.ghost_processorID_N(xp);
}
/*! \brief Apply the domain boundary conditions to every particle position
 *
 * One thread per particle; delegates the periodic wrap to applyPointBC_no_dec.
 *
 * NOTE(review): the __global__ signature line was lost in extraction; the
 * parameter list (domain box, periodicity flags, particle vector) is
 * reconstructed from the body's use of applyPointBC_no_dec(domain,bc,...) —
 * confirm against the original header.
 *
 * \param domain simulation domain box
 * \param bc per-dimension periodicity flags
 * \param parts particle position vector
 */
template<unsigned int dim,
         typename St,
         typename particles_type>
__global__
void apply_bc_each_part(Box<dim, St> domain, periodicity_int<dim> bc, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= parts.size())
        return;

    applyPointBC_no_dec(domain, bc, parts.get(p));
}
/*! \brief Scatter sorted positions/properties back into the unsorted vectors
 *
 * One thread per particle. Element p of the destination receives element
 * nss[p] of the ordered source. Only the listed properties `prp...` are
 * merged; positions are merged only when merge_pos is true.
 *
 * \param vd_pos destination position vector
 * \param vd_prp destination property vector
 * \param v_pos_ord ordered (sorted) position vector
 * \param vd_prp_ord ordered (sorted) property vector
 * \param nss permutation: nss[p] = source index for destination p
 *            (NOTE(review): this parameter line was dropped by extraction —
 *            reconstructed from the body's use of nss)
 */
template<bool merge_pos,
         typename vector_pos_type,
         typename vector_prp_type,
         typename stns_type,
         unsigned int ... prp>
__global__
void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
                     vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                     stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= vd_pos.size())
        return;

    if (merge_pos == true)
    {
        vd_pos.template set<0>(p, v_pos_ord, nss.template get<0>(p));
    }

    vd_prp.template set<prp ...>(p, vd_prp_ord, nss.template get<0>(p));
}
/*! \brief Scatter sorted positions and ALL properties back into the unsorted vectors
 *
 * Same as merge_sort_part but unconditionally merges the position and every
 * property (vd_prp.set copies the whole aggregate).
 *
 * \param vd_pos destination position vector
 * \param vd_prp destination property vector
 * \param v_pos_ord ordered (sorted) position vector
 * \param vd_prp_ord ordered (sorted) property vector
 * \param nss permutation: nss[p] = source index for destination p
 *            (NOTE(review): parameter line reconstructed — dropped by extraction)
 */
template<typename vector_pos_type,
         typename vector_prp_type,
         typename stns_type,
         unsigned int ... prp>
__global__
void merge_sort_all(vector_pos_type vd_pos, vector_prp_type vd_prp,
                    vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                    stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= vd_pos.size())
        return;

    vd_pos.template set<0>(p, v_pos_ord, nss.template get<0>(p));

    vd_prp.set(p, vd_prp_ord, nss.template get<0>(p));
}
/*! \brief Compute, for each particle, the processor that owns it after BC wrap
 *
 * One thread per particle. Applies the boundary conditions in place, then
 * labels the particle with its owning processor.
 *
 * NOTE(review): as extracted, this body assigned output<1>(p) twice — the
 * original line numbering (98/101/105 missing) indicates the two assignment
 * groups were alternatives under a stripped preprocessor conditional;
 * reconstructed below — confirm the macro name against the original.
 *
 * \param cdg cartesian decomposition (GPU side)
 * \param parts particle positions (wrapped in place)
 * \param output labels: <0> particle id, <1> target processor (-1 if local), <2> slot
 * \param prc_sz per-processor counters (atomic)
 * \param rank this processor's id
 */
template<unsigned int dim,
         typename St,
         typename cartdec_gpu,
         typename particles_type,
         typename vector_out,
         typename prc_sz_type>
__global__
void process_id_proc_each_part(cartdec_gpu cdg,
                               particles_type parts, vector_out output, prc_sz_type prc_sz,
                               int rank)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= parts.size())
        return;

    // wrap the position inside the domain
    cdg.applyPointBC(parts.get(p));

    // particle position after wrap
    // NOTE(review): declaration reconstructed — dropped by extraction
    Point<dim, St> xp = parts.template get<0>(p);

    int pr = cdg.processorID(xp);

#ifndef TEST1
    // mark local particles with -1 so they can be filtered out
    output.template get<1>(p) = (pr == rank) ? -1 : pr;
    output.template get<0>(p) = p;
#else
    output.template get<1>(p) = pr;
    // reserve a slot in the per-processor send list
    int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
    output.template get<2>(p) = nl;
#endif
}
/*! \brief Pack the particles selected for migration into the send buffers
 *
 * One thread per send-buffer slot; delegates the per-particle copy to
 * process_map_device_particle.
 *
 * \param m_opart labels produced by the map labelling phase
 * \param m_pos output (packed) position buffer
 * \param m_prp output (packed) property buffer
 * \param v_pos input position vector
 * \param v_prp input property vector
 * \param offset starting offset inside m_opart
 */
template<typename vector_m_opart_type,
         typename vector_pos_type_out,
         typename vector_prp_type_out,
         typename vector_pos_type_in,
         typename vector_prp_type_in>
__global__
void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
                           vector_pos_type_in v_pos, vector_prp_type_in v_prp,
                           unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= m_pos.size())
        return;

    process_map_device_particle<proc_without_prp_device>(i, offset, m_opart, m_pos, m_prp, v_pos, v_prp);
}
/*! \brief Pack the listed properties of the ghost particles into the send buffer
 *
 * One thread per ghost slot; delegates to process_ghost_device_particle_prp,
 * copying only the properties in `prp...`.
 *
 * \param g_opart ghost labels (which particle goes where)
 * \param m_prp output (packed) property buffer
 * \param v_prp input property vector
 * \param offset starting offset inside g_opart
 */
template<typename vector_g_opart_type,
         typename vector_prp_type_out,
         typename vector_prp_type_in,
         unsigned int ... prp>
__global__
void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
                                 vector_prp_type_in v_prp,
                                 unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= m_prp.size())
        return;

    process_ghost_device_particle_prp<vector_g_opart_type, vector_prp_type_out, vector_prp_type_in, prp...>(i, offset, g_opart, m_prp, v_prp);
}
/*! \brief Pack ghost properties for a ghost_put (no opart indirection)
 *
 * One thread per slot; copies the properties in `prp...` contiguously via
 * process_ghost_device_particle_prp.
 *
 * \param m_prp output (packed) property buffer
 * \param v_prp input property vector
 * \param offset starting offset inside v_prp
 */
template<typename vector_prp_type_out,
         typename vector_prp_type_in,
         unsigned int ... prp>
__global__
void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
                                     vector_prp_type_in v_prp,
                                     unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= m_prp.size())
        return;

    process_ghost_device_particle_prp<vector_prp_type_out, vector_prp_type_in, prp...>(i, offset, m_prp, v_prp);
}
/*! \brief Pack the shifted positions of the ghost particles into the send buffer
 *
 * One thread per ghost slot. g_opart property 1 packs two 32-bit fields into
 * one 64-bit word: low 32 bits = source particle id, high 32 bits = index of
 * the periodicity shift to subtract.
 *
 * \param g_opart ghost labels (property 1 = packed id/shift)
 * \param m_pos output (packed) position buffer
 * \param v_pos input position vector
 * \param shifts table of periodicity shift vectors
 * \param offset starting offset inside g_opart
 */
template<unsigned int dim,
         typename vector_g_opart_type,
         typename vector_pos_type_out,
         typename vector_pos_type_in,
         typename vector_shift_type_in>
__global__
void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
                                 vector_pos_type_in v_pos, vector_shift_type_in shifts,
                                 unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= m_pos.size())
        return;

    // packed word: [63..32] shift index, [31..0] particle id
    unsigned long int psid = g_opart.template get<1>(i + offset);

    unsigned int id = psid & 0xFFFFFFFF;
    unsigned int shift_id = psid >> 32;

    // copy the position applying the periodicity shift
    for (int j = 0; j < dim; j++)
    {
        m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];
    }
}
/*! \brief Materialize the local (same-processor) ghost particles in place
 *
 * One thread per local-ghost entry. Appends, starting at index `base`, a
 * shifted copy of particle g_opart<0>(i); properties are always copied,
 * positions only when with_pos is true.
 *
 * \param g_opart local ghost labels: <0> source particle id, <1> shift index
 * \param v_pos position vector (read source, write ghost section)
 * \param v_prp property vector (read source, write ghost section)
 * \param shifts table of periodicity shift vectors
 * \param base first index of the ghost section in v_pos/v_prp
 */
template<bool with_pos,
         unsigned int dim,
         typename vector_g_opart_type,
         typename vector_pos_type,
         typename vector_prp_type,
         typename vector_shift_type_in>
__global__
void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
                                   vector_shift_type_in shifts,
                                   unsigned int base)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= g_opart.size())
        return;

    unsigned int pid = g_opart.template get<0>(i);
    unsigned int shift_id = g_opart.template get<1>(i);

    if (with_pos == true)
    {
        // write the shifted position into the ghost slot
        for (int j = 0; j < dim; j++)
        {
            v_pos.template get<0>(base + i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];
        }
    }

    // properties are copied unshifted
    v_prp.set(base + i, v_prp.get(pid));
}
/*! \brief Count how many shifted ghost replicas each particle produces
 *
 * One thread per real particle (p < g_m). A particle is replicated once per
 * ghost box it falls into, except that consecutive boxes carrying the same
 * shift are counted once (the sw/old_shift de-duplication).
 *
 * NOTE(review): the extraction dropped the xp declaration, the counter `n`
 * and the box-membership test; they are reconstructed below (isInsideNP
 * matches the box API referenced by this file) — confirm upstream.
 *
 * \param box_f ghost boxes
 * \param box_f_sv shift index associated with each ghost box
 * \param vd particle position vector
 * \param out per-particle replica count (property 0)
 * \param g_m number of real (non-ghost) particles
 */
template<unsigned int dim,
         typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type,
         typename output_type>
__global__
void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                               vector_type vd, output_type out,
                               unsigned int g_m)
{
    // sentinel: no shift seen yet
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // only real particles are processed
    if (p >= g_m)
        return;

    Point<dim, St> xp = vd.template get<0>(p);

    unsigned int n = 0;

    for (unsigned int i = 0; i < box_f.size(); i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        // sw = true when this box repeats the previous shift (already counted)
        bool sw = (old_shift == shift_actual) ? true : false;

        if (Box<dim, St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            old_shift = shift_actual;
            n++;
        }
    }

    out.template get<0>(p) = n;
}
/*! \brief Create the shifted ghost replicas of each particle
 *
 * One thread per real particle (p < g_m). Using the offsets in `start`
 * (scan of num_shift_ghost_each_part), appends one shifted copy of the
 * particle per matching ghost box, de-duplicating consecutive boxes with the
 * same shift exactly as the counting kernel does.
 *
 * NOTE(review): the visible template list skipped one parameter while the
 * body uses start_type — `typename start_type` restored. The xp/n
 * declarations and the box-membership test were also dropped by extraction
 * and are reconstructed — confirm upstream.
 *
 * \param box_f ghost boxes
 * \param box_f_sv shift index associated with each ghost box
 * \param v_pos position vector (ghost section written at base+n)
 * \param v_prp property vector (ghost section written at base+n)
 * \param start per-particle scan offsets into the ghost section
 * \param shifts table of periodicity shift vectors
 * \param output bookkeeping: <0> source particle, <1> shift used
 * \param offset index of the first ghost slot in v_pos/v_prp
 * \param g_m number of real (non-ghost) particles
 */
template<unsigned int dim,
         typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type_pos,
         typename vector_type_prp,
         typename start_type,
         typename shifts_type,
         typename output_type>
__global__
void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                           vector_type_pos v_pos, vector_type_prp v_prp,
                           start_type start, shifts_type shifts,
                           output_type output,
                           unsigned int offset,
                           unsigned int g_m)
{
    // sentinel: no shift seen yet
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    // only real particles are processed
    if (p >= g_m)
        return;

    Point<dim, St> xp = v_pos.template get<0>(p);

    // base_o indexes the bookkeeping output, base the ghost slots
    unsigned int base_o = start.template get<0>(p);
    unsigned int base = base_o + offset;

    unsigned int n = 0;

    for (unsigned int i = 0; i < box_f.size(); i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        // sw = true when this box repeats the previous shift (already emitted)
        bool sw = (old_shift == shift_actual) ? true : false;

        if (Box<dim, St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            // shifted position of the replica
            for (unsigned int j = 0; j < dim; j++)
            {
                v_pos.template get<0>(base + n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
            }

            output.template get<0>(base_o + n) = p;
            output.template get<1>(base_o + n) = shift_actual;

            v_prp.set(base + n, v_prp.get(p));

            old_shift = shift_actual;
            n++;
        }
    }
}
/*! \brief Reorder the map labels by destination processor
 *
 * One thread per label. Element i is placed at starts[pr] + slot, where pr is
 * its destination processor (property 1) and slot its per-processor position
 * (property 2), so property 0 ends up grouped by processor.
 *
 * \param m_opart labels: <1> destination processor, <2> slot; <0> written
 * \param starts per-processor scan offsets
 */
template<typename vector_lbl_type,
         typename starts_type>
__global__
void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= m_opart.size())
        return;

    int pr = m_opart.template get<1>(i);

    m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
}
278template<
typename red_type>
282template<
typename red_type>
/*! \brief Reduce property <prp> across the local particles on the GPU
 *
 * Host-side helper: runs openfpm::reduce over the device buffer of property
 * <prp> with the reduction operator `op`, then copies the single result back
 * to the host and returns it.
 *
 * NOTE(review): the buffer allocation, reduce-call prologue and return were
 * dropped by extraction; reconstructed using CudaMemory (included by this
 * header) — confirm against the original.
 *
 * \tparam prp property to reduce
 * \tparam op reduction operator template (e.g. an add/max functor)
 * \param vd distributed particle vector
 * \return the reduced value, with the property's own type
 */
template<unsigned int prp,
         template <typename> class op,
         typename vector_type>
auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
    typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;

    // single-element device buffer for the reduction result
    CudaMemory mem;
    mem.allocate(sizeof(reduce_type));

    openfpm::reduce((reduce_type *)vd.getPropVector().template getDeviceBuffer<prp>(),
                    vd.size_local(),
                    (reduce_type *)mem.getDevicePointer(),
                    op<reduce_type>(), vd.getVC().getgpuContext());

    mem.deviceToHost();

    return *(reduce_type *)(mem.getPointer());
}
/*! \brief Fill property 0 with each element's own index (identity permutation)
 *
 * One thread per element; typically used to seed an index vector before a
 * key-value sort.
 *
 * NOTE(review): the __global__ signature line was lost in extraction; kernel
 * name and single-vector parameter reconstructed — confirm upstream.
 *
 * \param vd vector whose property 0 receives the indexes
 */
template<typename vector_type>
__global__
void create_index(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= vd.size())
        return;

    vd.template get<0>(i) = i;
}
/*! \brief Gather particles from src into dst following an index map
 *
 * One thread per destination element: dst[i] = src[idx[i]] for both the
 * position (component-wise over dim) and the full property aggregate.
 *
 * \param vd_pos_dst destination position vector
 * \param vd_prp_dst destination property vector
 * \param vd_pos_src source position vector
 * \param vd_prp_src source property vector
 * \param idx gather indexes (property 0)
 */
template<unsigned int dim,
         typename vector_pos_type,
         typename vector_prp_type,
         typename ids_type>
__global__
void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= vd_prp_dst.size())
        return;

    // gather the position component by component
    for (unsigned int k = 0; k < dim; k++)
    {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}

    // gather the whole property aggregate
    vd_prp_dst.set(i, vd_prp_src, idx.template get<0>(i));
}
/*! \brief Compact particles using a scan of the keep-flags
 *
 * One thread per source element. Elements whose (flipped) flag <prp> is 0
 * are skipped; the survivors are scattered to position scan[i], producing a
 * densely packed destination.
 *
 * \param vd_pos_dst destination position vector (compacted)
 * \param vd_prp_dst destination property vector (compacted)
 * \param vd_pos_src source position vector
 * \param vd_prp_src source property vector (property <prp> holds the keep flag)
 * \param scan exclusive scan of the keep flags: destination index per source
 */
template<unsigned int dim,
         unsigned int prp,
         typename vector_pos_type,
         typename vector_prp_type,
         typename scan_type>
__global__
void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (i >= scan.size())
        return;

    // destination slot for a surviving element
    auto sc = scan.template get<0>(i);

    // flag 0 means this element is discarded
    if (vd_prp_src.template get<prp>(i) == 0)
        return;

    for (unsigned int k = 0; k < dim; k++)
    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}

    vd_prp_dst.set(sc, vd_prp_src, i);
}
/*! \brief Invert a 0/1 marker property: 0 becomes 1 (keep), non-zero becomes 0
 *
 * One thread per local particle. Used by remove_marked so that a scan of the
 * flipped flags yields compaction offsets for the surviving particles.
 *
 * NOTE(review): the __global__ signature and the bounds guard were lost in
 * extraction and are reconstructed (size_local matches getProp's local
 * indexing) — confirm upstream.
 *
 * \param vd particle vector; property <prp> is flipped in place
 */
template<unsigned int prp,
         typename vector_type>
__global__
void flip_one_to_zero(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard over the local particles
    if (i >= vd.size_local())
        return;

    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
}
// Host-side helper: remove from `vd` every particle whose property <prp> is
// marked (non-zero), compacting the position/property vectors on the GPU.
// NOTE(review): this block is garbled by extraction — braces, the guard
// `return`s, the n_marked computation, idx's declaration/resize, and the
// final swap of the old/new vectors are missing from view. The surviving
// statements are preserved byte-for-byte below with comments only; restore
// the missing lines from the original before compiling.
364template<
unsigned int prp,
typename vector_type>
365void remove_marked(
vector_type & vd,
// n: GPU block size handed to getDomainIteratorGPU (default 1024)
const int n = 1024)
// The marker property must be a plain integral/floating scalar: reject any
// other type with a diagnostic (boost::mpl::at extracts the property type).
368 if (std::is_same<
typename boost::mpl::at<
typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,
int >::value ==
false &&
369 std::is_same<
typename boost::mpl::at<
typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,
unsigned int >::value ==
false &&
370 std::is_same<
typename boost::mpl::at<
typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,
float >::value ==
false &&
371 std::is_same<
typename boost::mpl::at<
typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,
double >::value ==
false &&
372 std::is_same<
typename boost::mpl::at<
typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,
size_t >::value ==
false)
// unsupported marker type: report and (in the original) bail out
374 std::cout << __FILE__ <<
":" << __LINE__ <<
" error, the function remove_marked work only if is an integer or unsigned int" << std::endl;
// remove_type = the concrete scalar type of the marker property
381 typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;
// GPU iterator over the local domain, block size n
385 auto ite = vd.getDomainIteratorGPU(n);
// flip the marks so that 1 = keep, 0 = remove (see flip_one_to_zero)
387 CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());
// reuse the shared scratch memory mem_tmp for the scan buffer `idx`
// (declared in a line missing from this extraction)
393 if (mem_tmp.ref() == 0)
396 idx.setMemory(mem_tmp);
// exclusive scan of the keep flags: idx[i] = compacted index of particle i
399 openfpm::scan((remove_type *)vd.
getPropVector().template getDeviceBuffer<prp>(),vd.
size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.
getVC().getgpuContext());
// bring the last scan entry to the host to compute how many survive
403 idx.template deviceToHost<0>(idx.
size()-1,idx.
size()-1);
// fresh position/property vectors with the same concrete types as vd's
417 typename std::remove_reference<
decltype(vd.
getPosVector())>::type vd_pos_new;
418 typename std::remove_reference<
decltype(vd.
getPropVector())>::type vd_prp_new;
// size them for the survivors only
// (n_marked is computed in a line missing from this extraction)
438 vd_pos_new.resize(vd.
size_local() - n_marked);
439 vd_prp_new.resize(vd.
size_local() - n_marked);
// scatter the survivors into the new vectors using the scan offsets
444 CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
// update the ghost marker to the new (compacted) size
446 vd.set_g_m(vd_pos_new.size());
/*! \brief Write a 0/1 mark per element: 1 when functor::check passes and the
 *         element is a real (non-ghost) particle
 *
 * One thread per element of vd; elements at or beyond g_m (ghosts) are always
 * marked 0. The marks feed the scan in get_indexes_by_type.
 *
 * \param vd particle vector, property <prp> is tested
 * \param out 0/1 marks (property 0)
 * \param g_m number of real (non-ghost) particles
 */
template<unsigned int prp,
         typename functor,
         typename particles_type,
         typename out_type>
__global__
void mark_indexes(particles_type vd, out_type out,
                  unsigned int g_m)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    // tail guard
    if (p >= vd.size())
    {return;}

    out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < g_m;
}
/*! \brief Scatter the indexes of the marked elements using their scan offsets
 *
 * One thread per element p (scan has size+1 entries from the exclusive scan).
 * A step in the scan (scan[p] != scan[p+1]) means element p was marked; its
 * index is written to ids at slot scan[p].
 *
 * NOTE(review): the step test `sp != spp` was dropped by extraction and is
 * reconstructed — sp/spp were computed but otherwise unused — confirm
 * upstream.
 *
 * \param scan exclusive scan of the 0/1 marks (size = n elements + 1)
 * \param ids output: indexes of the marked elements, densely packed
 */
template<typename out_type,
         typename ids_type>
__global__
void fill_indexes(out_type scan, ids_type ids)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    // last scan entry is the total count, not an element
    if (p >= scan.size() - 1)
    {return;}

    auto sp = scan.template get<0>(p);
    auto spp = scan.template get<0>(p + 1);

    // a step in the scan identifies a marked element
    if (sp != spp)
    {ids.template get<0>(scan.template get<0>(p)) = p;}
}
/*! \brief Collect the indexes of the particles whose property <prp> satisfies functor::check
 *
 * Host-side helper: marks the matching elements (mark_indexes), exclusive-scans
 * the marks in place, sizes `ids` from the scan total, then scatters the
 * matching indexes (fill_indexes).
 *
 * NOTE(review): the function signature and the declaration of the scan buffer
 * were lost in extraction; both are reconstructed from the body's uses of
 * vd/ids/end/context and mem_tmp — confirm against the original header.
 *
 * \param vd particle vector to inspect
 * \param ids output: indexes of the matching particles
 * \param end number of real particles to consider (passed to mark_indexes)
 * \param context gpu context used by the scan
 */
template<unsigned int prp,
         typename functor,
         typename vector_type,
         typename ids_type>
void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end, gpu::ofp_context_t & context)
{
    // scan buffer backed by the shared scratch memory mem_tmp
    openfpm::vector_gpu<aggregate<unsigned int>> scan;
    scan.setMemory(mem_tmp);
    scan.resize(vd.size() + 1);

    auto ite = scan.getGPUIterator();

    // 0/1 mark of the matching elements
    CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),end);

    // exclusive scan in place: scan[p] = output slot of element p
    openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),context);

    // last scan entry = total number of matches
    scan.template deviceToHost<0>(scan.size() - 1, scan.size() - 1);
    size_t nf = scan.template get<0>(scan.size() - 1);
    ids.resize(nf);

    CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
}
This class represent an N-dimensional box.
__device__ __host__ bool isInsideNP(const Point< dim, T > &p) const
Check if the point is inside the region excluding the positive part.
virtual bool allocate(size_t sz)
allocate memory
This class implement the point shape in an N-dimensional space.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Implementation of 1-D std::vector like structure.
size_t size_local() const
return the local size of the vector
const vector_dist_prop & getPropVector() const
return the property vector of all the particles
void setReferenceCounterToOne()
const vector_dist_pos & getPosVector() const
return the position vector of all the particles
Vcluster< Memory > & getVC()
Get the Virtual Cluster machine.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
In general a reduction of a type T produce a type T.