#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_

#include "Vector/util/vector_dist_funcs.hpp"
#include "Decomposition/common.hpp"
#include "lib/pdata.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/reduce_ofp.cuh"
#include "memory/CudaMemory.cuh"

//! Label each particle with the ids of the processors that must receive it as ghost
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec, vector_type vd, start_type starts, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int base = starts.template get<0>(p);

    dec.ghost_processor_ID(xp,out,base,p);
}
//! Count, for each particle, the number of processors that must receive it as ghost
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename output_type>
__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    out.template get<0>(p) = dec.ghost_processorID_N(xp);
}
//! Apply the boundary conditions to each particle position, without involving the decomposition
//! (kernel name and argument types reconstructed: a domain box and periodicity flags are assumed)
template<unsigned int dim, typename St, typename particles_type>
__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    applyPointBC_no_dec(domain,bc,parts.get(p));
}
//! Merge the sorted positions/properties back into the unsorted vectors, following the permutation in nss
template<bool merge_pos, typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
                                vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                                stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd_pos.size()) return;

    if (merge_pos == true)
    {
        vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
    }

    vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
}
//! Same as merge_sort_part, but merge the position and all the properties unconditionally
template<typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_all(vector_pos_type vd_pos, vector_prp_type vd_prp,
                               vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
                               stns_type nss)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd_pos.size()) return;

    vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));

    vd_prp.set(p,vd_prp_ord,nss.template get<0>(p));
}
//! For each particle, apply the boundary conditions and compute the processor that owns it
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz, int rank)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    cdg.applyPointBC(parts.get(p));
    Point<dim,St> xp = parts.template get<0>(p);

    int pr = cdg.processorID(xp);

// compile-time switch (macro name TEST1 assumed): the default path only labels the particles that
// leave the local processor, the alternative also counts how many particles go to each processor
#ifndef TEST1
    output.template get<1>(p) = (pr == rank)?-1:pr;
    output.template get<0>(p) = p;
#else
    output.template get<1>(p) = pr;
    int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
    output.template get<2>(p) = nl;
#endif
}
template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
         typename vector_pos_type_in, typename vector_prp_type_in>
__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
                                      vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
}
template<typename vector_g_opart_type, typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
                                            vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
}
template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
                                                vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,m_prp,v_prp);
}
template<unsigned int dim, typename vector_g_opart_type, typename vector_pos_type_out,
         typename vector_pos_type_in, typename vector_shift_type_in>
__global__ void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
                                            vector_pos_type_in v_pos, vector_shift_type_in shifts, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    unsigned long int psid = g_opart.template get<1>(i+offset);

    // the packed id encodes the source particle in the low 32 bits and the shift id in the high 32 bits
    unsigned int id = psid & 0xFFFFFFFF;
    unsigned int shift_id = psid >> 32;

    for (int j = 0; j < dim ; j++)
    {
        m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];
    }
}
template<bool with_pos, unsigned int dim, typename vector_g_opart_type, typename vector_pos_type,
         typename vector_prp_type, typename vector_shift_type_in>
__global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
                                              vector_shift_type_in shifts, unsigned int base)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= g_opart.size()) return;

    unsigned int pid = g_opart.template get<0>(i);
    unsigned int shift_id = g_opart.template get<1>(i);

    if (with_pos == true)
    {
        for (int j = 0; j < dim ; j++)
        {
            v_pos.template get<0>(base+i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];
        }
    }

    v_prp.set(base+i,v_prp.get(pid));
}
//! Count how many ghost replicas each particle generates (one for every distinct shift box that contains it)
template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts,
         typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                          vector_type vd, output_type out, unsigned int g_m)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= g_m) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            old_shift = shift_actual;
            n++;
        }
    }

    out.template get<0>(p) = n;
}
//! Generate the local ghost particles: for each distinct shift box containing the particle,
//! write the shifted position/properties at the precomputed offsets
template<unsigned int dim, typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type_pos,
         typename vector_type_prp,
         typename start_type,
         typename shifts_type,
         typename output_type>
__global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                      vector_type_pos v_pos, vector_type_prp v_prp,
                                      start_type start, shifts_type shifts,
                                      output_type output, unsigned int offset, unsigned int g_m)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= g_m) return;

    Point<dim,St> xp = v_pos.template get<0>(p);

    unsigned int base_o = start.template get<0>(p);
    unsigned int base = base_o + offset;

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            for (unsigned int j = 0 ; j < dim ; j++)
            {
                v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
            }

            output.template get<0>(base_o+n) = p;
            output.template get<1>(base_o+n) = shift_actual;

            v_prp.set(base+n,v_prp.get(p));

            old_shift = shift_actual;
            n++;
        }
    }
}
template<typename vector_lbl_type, typename starts_type>
__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_opart.size()) return;

    int pr = m_opart.template get<1>(i);

    m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
}
//! Reduction operators usable with reduce_local; in general a reduction of a type T produces a type T
//! (operator names and mgpu base functors reconstructed)
template<typename red_type>
struct _add_: mgpu::plus_t<red_type>
{};

template<typename red_type>
struct _max_: mgpu::maximum_t<red_type>
{};

//! Reduce the property prp of all the local particles with the operator op, returning the result on the host
template<unsigned int prp, template <typename> class op, typename vector_type>
auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
    typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;

    CudaMemory mem;
    mem.allocate(sizeof(reduce_type));

    openfpm::reduce((reduce_type *)vd.getPropVector().template getDeviceBuffer<prp>(),
                    vd.size_local(), (reduce_type *)mem.getDevicePointer(),
                    op<reduce_type>(), vd.getVC().getmgpuContext());

    mem.deviceToHost();

    return *(reduce_type *)(mem.getPointer());
}
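// Usage sketch for reduce_local (illustrative only: `vd` stands for any vector_dist with GPU support
// whose property 0 is a float already resident on the device; `_add_`/`_max_` are the reduction
// operators defined above):
//
//   float sum = reduce_local<0,_add_>(vd);   // sum of property 0 over the local particles
//   float mx  = reduce_local<0,_max_>(vd);   // maximum of property 0 over the local particles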
//! Fill property 0 of each element with its own index (kernel name reconstructed)
template<typename vector_type>
__global__ void create_index(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size()) return;

    vd.template get<0>(i) = i;
}
template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst,
                                vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd_prp_dst.size()) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}

    vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
}
template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst,
                                        vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= scan.size()) return;

    auto sc = scan.template get<0>(i);

    if (vd_prp_src.template get<prp>(i) == 0) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}

    vd_prp_dst.set(sc,vd_prp_src,i);
}
//! Flip the marker property: 1 becomes 0 and 0 becomes 1 (the bound check below is reconstructed)
template<unsigned int prp, typename vector_type>
__global__ void flip_one_to_zero(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size()) return;

    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
}
/*! \brief Remove the particles marked on the property prp (marked particles have the property set to 1, the others to 0)
 *
 * \warning the function is destructive on prp: after the call the property may contain garbage
 *
 * The scan-buffer setup and the final swap/reference-counter handling below are reconstructed
 * from the calls kept in the original.
 */
template<unsigned int prp, typename vector_type>
void remove_marked(vector_type & vd, const int n = 1024)
{
    // this function makes sense only on an integral or scalar marker property
    if (std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, unsigned int >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, float >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, double >::value == false &&
        std::is_same< typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type, size_t >::value == false)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " error, the function remove_marked works only if prp is an integer or unsigned int" << std::endl;
        return;
    }

    if (vd.size_local() == 0)
    {return;}

    typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;

    // the particles to remove are marked with 1: flip the marker so that a scan gives the new indexes
    auto ite = vd.getDomainIteratorGPU(n);

    CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());

    // the temporary memory can still be unreferenced the first time remove_marked is called
    if (mem_tmp.ref() == 0)
    {mem_tmp.incRef();}

    // exclusive scan of the flipped marker gives, for each kept particle, its destination index
    openfpm::vector_gpu<aggregate<unsigned int>> idx;
    idx.setMemory(mem_tmp);
    idx.resize(vd.size_local());

    openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),
                  (unsigned int *)idx.template getDeviceBuffer<0>(),vd.getVC().getmgpuContext());

    // check how many particles have been marked
    idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
    vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);

    int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));

    if (n_marked == 0)
    {return;}

    // build the compacted position/property vectors reusing the temporary buffers
    typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
    typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;

    vd_pos_new.setMemory(exp_tmp);
    vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);

    vd_pos_new.resize(vd.size_local() - n_marked);
    vd_prp_new.resize(vd.size_local() - n_marked);

    auto & vd_pos_old = vd.getPosVector();
    auto & vd_prp_old = vd.getPropVector();

    CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());

    vd.set_g_m(vd_pos_new.size());

    vd.getPosVector().swap_nomode(vd_pos_new);
    vd.getPropVector().swap_nomode(vd_prp_new);

    vd.setReferenceCounterToOne();
}
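// Usage sketch for remove_marked (illustrative only: `vd` is any vector_dist with an int marker
// property, here assumed to be property 2, and `mark_out_of_band` is a hypothetical user kernel
// that writes 1 into the marker of the particles to delete and 0 into all the others):
//
//   auto ite = vd.getDomainIteratorGPU();
//   CUDA_LAUNCH(mark_out_of_band,ite,vd.toKernel());
//   remove_marked<2>(vd);   // compacts the vector; the content of the marker property is destroyed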
template<unsigned int prp, typename functor, typename particles_type, typename out_type>
__global__ void mark_indexes(particles_type vd, out_type out, unsigned int g_m)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) {return;}

    out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < g_m;
}
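// The functor passed to mark_indexes/get_indexes_by_type only needs a static, device-callable
// check(). A minimal sketch (the name `is_zero` and the predicate are illustrative, not part of
// this header):
//
//   struct is_zero
//   {
//       template<typename value_type>
//       __device__ static bool check(const value_type & v)
//       {return v == 0;}
//   };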
template<typename out_type, typename ids_type>
__global__ void fill_indexes(out_type scan, ids_type ids)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= scan.size()-1) {return;}

    auto sp = scan.template get<0>(p);
    auto spp = scan.template get<0>(p+1);

    // the scan increases exactly where mark_indexes wrote a 1: those positions are the selected indexes
    if (sp != spp)
    {ids.template get<0>(scan.template get<0>(p)) = p;}
}
//! Fill ids with the indexes of the particles (up to end) whose property prp satisfies functor::check
template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end, mgpu::ofp_context_t & context)
{
    // mark the particles that satisfy the functor, then scan the marks to get compacted indexes
    openfpm::vector_gpu<aggregate<unsigned int>> scan;
    scan.setMemory(mem_tmp);
    scan.resize(vd.size()+1);

    auto ite = scan.getGPUIterator();

    CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),end);

    openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),
                  (unsigned int *)scan.template getDeviceBuffer<0>(),context);

    // the last element of the scan is the number of selected particles
    scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
    size_t nf = scan.template get<0>(scan.size()-1);
    ids.resize(nf);

    CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
}

#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */