#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_

#include "Vector/util/vector_dist_funcs.hpp"
#include "Decomposition/common.hpp"
#include "lib/pdata.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/reduce_ofp.cuh"
#include "memory/CudaMemory.cuh"

/*! \brief Label each particle with the ids of the processors whose ghost regions it falls into */
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec, vector_type vd, start_type starts, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int base = starts.template get<0>(p);

    dec.ghost_processor_ID(xp,out,base,p);
}

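/* Launch sketch (illustrative, not part of this header): the kernels in this
 * file are launched through the CUDA_LAUNCH macro over a GPU iterator, with
 * device views obtained via toKernel(). For the kernel above, e.g.
 *
 *   auto ite = v_pos.getGPUIterator();
 *   CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec_k),decltype(pos_k),decltype(starts_k),decltype(out_k)>),
 *               ite,dec_k,pos_k,starts_k,out_k);
 *
 * dec_k, pos_k, starts_k and out_k are hypothetical toKernel() views of the
 * decomposition, the particle positions, the scanned per-particle counts and
 * the output labels.
 */
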
/*! \brief For each particle count how many processor ghost regions contain its position */
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename output_type>
__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd, output_type out)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) return;

    Point<dim,St> xp = vd.template get<0>(p);

    out.template get<0>(p) = dec.ghost_processorID_N(xp);
}

/*! \brief Apply the domain boundary conditions to each particle position */
template<unsigned int dim, typename St, typename particles_type>
__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    applyPointBC_no_dec(domain,bc,parts.get(p));
}

/*! \brief Assign to each particle the processor that owns it after applying the boundary conditions */
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz, int rank)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    cdg.applyPointBC(parts.get(p));
    Point<dim,St> xp = parts.template get<0>(p);

    int pr = cdg.processorID(xp);

#ifndef TEST1
    output.template get<1>(p) = (pr == rank)?-1:pr;
    output.template get<0>(p) = p;
#else
    output.template get<1>(p) = pr;

    // nl is the position of this particle within the group of particles
    // directed to processor pr
    int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
    output.template get<2>(p) = nl;
#endif
}

template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
         typename vector_pos_type_in, typename vector_prp_type_in>
__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
                                      vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
}

template<typename vector_g_opart_type, typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector_prp_type_out m_prp,
                                            vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
}

template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
__global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
                                                vector_prp_type_in v_prp, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_prp.size()) return;

    process_ghost_device_particle_prp<vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,m_prp,v_prp);
}

template<unsigned int dim, typename vector_g_opart_type, typename vector_pos_type_out,
         typename vector_pos_type_in, typename vector_shift_type_in>
__global__ void process_ghost_particles_pos(vector_g_opart_type g_opart, vector_pos_type_out m_pos,
                                            vector_pos_type_in v_pos, vector_shift_type_in shifts, unsigned int offset)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_pos.size()) return;

    unsigned long int psid = g_opart.template get<1>(i+offset);

    unsigned int id = psid & 0xFFFFFFFF;
    unsigned int shift_id = psid >> 32;

    for (int j = 0; j < dim ; j++)
    {m_pos.template get<0>(i)[j] = v_pos.template get<0>(id)[j] - shifts.template get<0>(shift_id)[j];}
}

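/* Encoding assumed by the kernel above: property 1 of g_opart packs the
 * source particle id and the shift index into one 64-bit word,
 *
 *   unsigned long int psid = ((unsigned long int)shift_id << 32) | id;
 *
 * so the lower 32 bits recover the particle and the upper 32 bits the shift
 * vector to subtract.
 */
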
template<bool with_pos, unsigned int dim, typename vector_g_opart_type, typename vector_pos_type,
         typename vector_prp_type, typename vector_shift_type_in>
__global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vector_pos_type v_pos, vector_prp_type v_prp,
                                              vector_shift_type_in shifts, unsigned int base)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= g_opart.size()) return;

    unsigned int pid = g_opart.template get<0>(i);
    unsigned int shift_id = g_opart.template get<1>(i);

    if (with_pos == true)
    {
        for (int j = 0; j < dim ; j++)
        {v_pos.template get<0>(base+i)[j] = v_pos.template get<0>(pid)[j] - shifts.template get<0>(shift_id)[j];}
    }

    v_prp.set(base+i,v_prp.get(pid));
}

/*! \brief Count how many ghost replicas (one per distinct shift) each particle generates */
template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts,
         typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                          vector_type vd, output_type out, unsigned int ghostMarker)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= ghostMarker) return;

    Point<dim,St> xp = vd.template get<0>(p);

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        // count each shift vector at most once (boxes with the same shift are consecutive)
        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            old_shift = shift_actual;
            n++;
        }
    }

    out.template get<0>(p) = n;
}

/*! \brief Write the shifted ghost replicas of each particle at its scanned offset */
template<unsigned int dim, typename St,
         typename vector_of_box,
         typename vector_of_shifts,
         typename vector_type_pos,
         typename vector_type_prp,
         typename start_type,
         typename shifts_type,
         typename output_type>
__global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv,
                                      vector_type_pos v_pos, vector_type_prp v_prp,
                                      start_type start, shifts_type shifts,
                                      output_type output, unsigned int offset, unsigned int ghostMarker)
{
    unsigned int old_shift = (unsigned int)-1;
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= ghostMarker) return;

    Point<dim,St> xp = v_pos.template get<0>(p);

    unsigned int base_o = start.template get<0>(p);
    unsigned int base = base_o + offset;

    unsigned int n = 0;

    for (unsigned int i = 0 ; i < box_f.size() ; i++)
    {
        unsigned int shift_actual = box_f_sv.template get<0>(i);
        bool sw = (old_shift == shift_actual)?true:false;

        // replicate the particle only once per shift vector
        if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
        {
            for (unsigned int j = 0 ; j < dim ; j++)
            {v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];}

            output.template get<0>(base_o+n) = p;
            output.template get<1>(base_o+n) = shift_actual;

            v_prp.set(base+n,v_prp.get(p));

            old_shift = shift_actual;
            n++;
        }
    }
}

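/* num_shift_ghost_each_part and shift_ghost_each_part form a count/scan/fill
 * pattern. A host-side sketch (illustrative names, not part of this header):
 *
 *   CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,...>),ite,box_k,sv_k,pos_k,cnt_k,gm);
 *   openfpm::scan(...);   // exclusive scan of the counts -> per-particle write offsets
 *   CUDA_LAUNCH((shift_ghost_each_part<dim,St,...>),ite,box_k,sv_k,pos_k,prp_k,starts_k,shifts_k,out_k,off,gm);
 *
 * Because every particle writes its replicas starting at its own scanned
 * offset, the fill step needs no atomics.
 */
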
template<typename vector_lbl_type, typename starts_type>
__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= m_opart.size()) return;

    int pr = m_opart.template get<1>(i);

    m_opart.template get<0>(starts.template get<0>(pr) + m_opart.template get<2>(i)) = i;
}

/*! \brief Reduction functors usable with reduce_local (sum and maximum) */
template<typename red_type>
struct _add_: gpu::plus_t<red_type>
{};

template<typename red_type>
struct _max_: gpu::maximum_t<red_type>
{};

/*! \brief Reduce the property prp of the local particles on the device */
template<unsigned int prp, template <typename> class op, typename vector_type>
auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
    typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;

    CudaMemory mem;
    mem.allocate(sizeof(reduce_type));

    openfpm::reduce((reduce_type *)vd.getPropVector().template getDeviceBuffer<prp>(),
                    vd.size_local(), (reduce_type *)mem.getDevicePointer(),
                    op<reduce_type>(), vd.getVC().getGpuContext());

    mem.deviceToHost();

    return *(reduce_type *)(mem.getPointer());
}

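/* Usage sketch (illustrative; assumes the _add_/_max_ functors above): sum or
 * maximum of property 0 over the local particles of a GPU distributed vector
 *
 *   auto sum = reduce_local<0,_add_>(vd);
 *   auto max = reduce_local<0,_max_>(vd);
 *
 * Only the scalar result is copied back to the host; the reduction itself
 * runs on the device buffer of the property.
 */
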
/*! \brief Fill property 0 of each element with its own index */
template<typename vector_type>
__global__ void create_index(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size()) return;

    vd.template get<0>(i) = i;
}

template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd_prp_dst.size()) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(i)[k] = vd_pos_src.template get<0>(idx.template get<0>(i))[k];}

    vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
}

template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= scan.size()) return;

    auto sc = scan.template get<0>(i);

    if (vd_prp_src.template get<prp>(i) == 0) return;

    for (unsigned int k = 0 ; k < dim ; k++)
    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}

    vd_prp_dst.set(sc,vd_prp_src,i);
}

/*! \brief Flip the marks on property prp: after the flip 1 means keep, 0 means remove */
template<unsigned int prp, typename vector_type>
__global__ void flip_one_to_zero(vector_type vd)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= vd.size_local()) return;

    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
}

/*! \brief Remove the particles marked on property prp (a particle is removed if prp is set to 1, kept if 0)
 *
 * \warning the function is destructive on prp: after the call prp can contain garbage
 *
 * \tparam prp property that marks the particles to remove
 * \param vd distributed vector
 * \param n block size for the GPU iterator
 */
template<unsigned int prp, typename vector_type>
void remove_marked(vector_type & vd, const int n = 1024)
{
    // this function makes sense only if prp is an integer type
    if (std::is_same<typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,int>::value == false &&
        std::is_same<typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,unsigned int>::value == false &&
        std::is_same<typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type,size_t>::value == false)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " error, the function remove_marked works only if the property is an integer or unsigned int" << std::endl;
        return;
    }

    if (vd.size_local() == 0)
    {return;}

    typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;

    // because we mark the particles to remove, flip the marks so that 1 means keep
    auto ite = vd.getDomainIteratorGPU(n);
    CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());

    // scan the flipped marks to get the destination index of every kept particle
    openfpm::vector_gpu<aggregate<remove_type>> idx;
    if (mem_tmp.ref() == 0)
    {mem_tmp.incRef();}

    idx.setMemory(mem_tmp);
    idx.resize(vd.size_local());

    openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getGpuContext());

    // the number of kept particles is the last scan value plus the last flipped mark
    idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
    vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);

    int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));

    if (n_marked == 0)
    {return;}

    // create the new position and property vectors, reusing the temporal
    // buffers exp_tmp / exp_tmp2 to avoid a fresh allocation
    typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
    typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;

    vd_pos_new.setMemory(exp_tmp);
    vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);

    vd_pos_new.resize(vd.size_local() - n_marked);
    vd_prp_new.resize(vd.size_local() - n_marked);

    auto & vd_pos_old = vd.getPosVector();
    auto & vd_prp_old = vd.getPropVector();

    CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());

    vd.setGhostMarker(vd_pos_new.size());

    // swap the compacted vectors in place of the old ones
    vd.getPosVector().swap_nomode(vd_pos_new);
    vd.getPropVector().swap_nomode(vd_prp_new);

    vd.setReferenceCounterToOne();
}

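/* Usage sketch (illustrative, assuming an int property 1 used as the mark,
 * 1 = remove, 0 = keep):
 *
 *   // in a kernel: vd_k.template getProp<1>(p) = (should_remove ? 1 : 0);
 *   remove_marked<1>(vd);
 *
 * After the call property 1 contains garbage and the ghost marker is reset
 * to the new local size, so the ghost must be rebuilt if needed.
 */
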
template<unsigned int prp, typename functor, typename particles_type, typename out_type>
__global__ void mark_indexes(particles_type vd, out_type out, unsigned int ghostMarker)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= vd.size()) {return;}

    out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true && p < ghostMarker;
}

template<typename out_type, typename ids_type>
__global__ void fill_indexes(out_type scan, ids_type ids)
{
    unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= scan.size()-1) {return;}

    auto sp = scan.template get<0>(p);
    auto spp = scan.template get<0>(p+1);

    // where the exclusive scan increases, particle p has been marked
    if (sp != spp)
    {ids.template get<0>(scan.template get<0>(p)) = p;}
}

/*! \brief Collect the indexes of the particles (up to end) whose property prp satisfies functor::check */
template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
void get_indexes_by_type(vector_type & vd, ids_type & ids, size_t end, gpu::ofp_context_t & gpuContext)
{
    // mark the particles that satisfy the functor, then scan the marks
    openfpm::vector_gpu<aggregate<unsigned int>> scan;
    scan.setMemory(mem_tmp);
    scan.resize(vd.size()+1);

    auto ite = scan.getGPUIterator();

    CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel(),(unsigned int)end);

    openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),gpuContext);

    // the last scan entry is the number of marked particles
    scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
    size_t nf = scan.template get<0>(scan.size()-1);
    ids.resize(nf);

    CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
}

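/* Usage sketch (illustrative, the functor name is hypothetical): collect the
 * indexes of the particles whose property 2 is non-zero
 *
 *   struct is_marked
 *   {
 *       template<typename value_type>
 *       __device__ static bool check(const value_type & v) {return v != 0;}
 *   };
 *
 *   openfpm::vector_gpu<aggregate<unsigned int>> ids;
 *   get_indexes_by_type<2,is_marked>(vd.getPropVector(),ids,vd.size_local(),
 *                                    vd.getVC().getGpuContext());
 */
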
#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */