#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

#include "util/common_pdata.hpp"
constexpr int NO_POSITION = 1;
constexpr int WITH_POSITION = 2;
constexpr int NO_CHANGE_ELEMENTS = 4;

constexpr int BIND_DEC_TO_GHOST = 1;

constexpr int MAP_LOCAL = 2;

constexpr int GHOST_SYNC = 0;
constexpr int GHOST_ASYNC = 1;
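// Note: the constants above are option flags that callers combine with
// bitwise OR and that the functions below test with expressions such as
// (opt & NO_POSITION); see the uses of the opt parameter further down.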
template<unsigned int dim, typename St, typename prop, typename Memory,
         template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
    static void run(CudaMemory & mem,
                    Decomposition & dec,
                    openfpm::vector<aggregate<unsigned int,unsigned long int>,CudaMemory,memory_traits_inte> & g_opart_device,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & proc_id_out,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
                    Vcluster<Memory> & v_cl,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    openfpm::vector<size_t> & prc,
                    openfpm::vector<size_t> & prc_sz,
                    openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
                    size_t & g_m,
                    size_t opt)
    {
        std::cout << __FILE__ << ":" << __LINE__
                  << " error, you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
    }
};
template<unsigned int dim, typename St, typename prop, typename Memory,
         template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
{
    static void run(CudaMemory & mem,
                    Decomposition & dec,
                    openfpm::vector<aggregate<unsigned int,unsigned long int>,CudaMemory,memory_traits_inte> & g_opart_device,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & proc_id_out,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
                    Vcluster<Memory> & v_cl,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    openfpm::vector<size_t> & prc,
                    openfpm::vector<size_t> & prc_sz,
                    openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
                    size_t & g_m,
                    size_t opt)
    {
#if defined(CUDA_GPU) && defined(__NVCC__)

        // One extra slot acts as the terminal element of the scan below
        proc_id_out.resize(v_pos.size()+1);
        proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
        proc_id_out.template hostToDevice<0>(proc_id_out.size()-1,proc_id_out.size()-1);

        auto ite = v_pos.getGPUIterator();
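        // Count, for each particle, how many ghost entries it generates
        // (from the kernel name, presumably one entry per processor that
        // must receive the particle as a ghost)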
        CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,
                     decltype(dec.toKernel()),
                     decltype(v_pos.toKernel()),
                     decltype(proc_id_out.toKernel())>),
                    ite,
                    dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
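        // Scan the per-particle counts: starts then holds the write offset of
        // every particle and, in its last element, the total number of ghost
        // entries to produce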
        starts.resize(proc_id_out.size());
        openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(),
                      proc_id_out.size(),
                      (unsigned int *)starts.template getDeviceBuffer<0>(),
                      v_cl.getgpuContext());
        starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
        size_t sz = starts.template get<0>(starts.size()-1);
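        // Label every ghost entry with the id of its destination processor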
        g_opart_device.resize(sz);

        ite = v_pos.getGPUIterator();

        CUDA_LAUNCH((proc_label_id_ghost<dim,St,
                     decltype(dec.toKernel()),
                     decltype(v_pos.toKernel()),
                     decltype(starts.toKernel()),
                     decltype(g_opart_device.toKernel())>),
                    ite,
                    dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
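        // Sort the (processor id, particle id) pairs by processor id, so that
        // entries directed to the same processor become contiguous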
        openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),
                      (long unsigned int *)g_opart_device.template getDeviceBuffer<1>(),
                      g_opart_device.size(),
                      gpu::template less_t<int>(),
                      v_cl.getgpuContext());

        mem.allocate(sizeof(int));
        mem.fill(0);
        prc_offset.resize(v_cl.size());
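        // Find, in the sorted list, the offset at which each per-processor
        // segment starts; the counter held in mem records how many segments
        // were detected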
        ite = g_opart_device.getGPUIterator();

        CUDA_LAUNCH((find_buffer_offsets<0,
                     decltype(g_opart_device.toKernel()),
                     decltype(prc_offset.toKernel())>),
                    ite,
                    g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());

        // Bring the number of detected segments back to the host
        mem.deviceToHost();
        int noff = *(int *)mem.getPointer();

        prc_offset.resize(noff+1,DATA_ON_DEVICE);
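        // Append a terminal entry (total size, id of the last processor) so
        // that the difference of consecutive offsets gives each segment size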
        if (g_opart_device.size() != 0)
        {g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
        prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
        if (g_opart_device.size() != 0)
        {prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
        else
        {prc_offset.template get<1>(prc_offset.size()-1) = 0;}

        prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
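        // Reorder the (offset, processor id) pairs in ascending offset order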
        openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),
                      (int *)prc_offset.template getDeviceBuffer<1>(),
                      prc_offset.size(),
                      gpu::template less_t<int>(),
                      v_cl.getgpuContext());

        prc_offset.template deviceToHost<0,1>();
        // No ghost entry at all: no communication is required
        if (g_opart_device.size() == 0)
        {noff = -1;}

        prc.resize(noff+1);
        prc_sz.resize(noff+1);

        size_t base_offset = 0;
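        // Turn the cumulative offsets into per-processor counts and copy the
        // list of destination processors into prc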
        for (size_t i = 0 ; i < (size_t)(noff+1) ; i++)
        {
            prc.get(i) = prc_offset.template get<1>(i);
            prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
            base_offset = prc_offset.template get<0>(i);
        }
#else
        std::cout << __FILE__ << ":" << __LINE__
                  << " error: to use gpu computation you must compile vector_dist.hpp with NVCC" << std::endl;
#endif
    }
};
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory,
         template<typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
    static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
                    const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t opt)
    {
        std::cout << __FILE__ << ":" << __LINE__
                  << " error, you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
    }
};
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory,
         template<typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
    static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
                    const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t opt)
    {
#if defined(CUDA_GPU) && defined(__NVCC__)
        auto ite = o_part_loc.getGPUIterator();

        size_t old = v_pos.size();

        if (!(opt & NO_POSITION))
        {v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}

        if (!(opt & SKIP_LABELLING))
        {v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);}
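        // Append the local ghost particles after the old end of the vectors,
        // presumably applying the periodic-image shift selected by each
        // o_part_loc entry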
        CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,
                     decltype(o_part_loc.toKernel()),
                     decltype(v_pos.toKernel()),
                     decltype(v_prp.toKernel()),
                     decltype(shifts.toKernel())>),
                    ite,
                    o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
#else
        std::cout << __FILE__ << ":" << __LINE__
                  << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
    }
};
template<unsigned int dim, typename St, typename prop, typename Memory,
         template<typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
    static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
                    const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts,
                    openfpm::vector<Box<dim,St>,Memory,layout_base> & box_f_dev,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
                    Vcluster<Memory> & v_cl,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t & g_m,
                    size_t opt)
    {
        std::cout << __FILE__ << ":" << __LINE__
                  << " error, you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
    }
};
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
    static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
                    const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts,
                    openfpm::vector<Box<dim,St>,Memory,layout_base> & box_f_dev,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
                    Vcluster<Memory> & v_cl,
                    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
                    openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t & g_m,
                    size_t opt)
    {
#if defined(CUDA_GPU) && defined(__NVCC__)
        // One extra slot acts as the terminal element of the scan below
        o_part_loc.resize(g_m+1);
        o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
        o_part_loc.template hostToDevice<0>(o_part_loc.size()-1,o_part_loc.size()-1);

        auto ite = v_pos.getGPUIteratorTo(g_m);
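        // Count, for each real particle, how many shifted ghost copies the
        // decomposition boxes require it to generate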
        CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,
                     decltype(box_f_dev.toKernel()),
                     decltype(box_f_sv.toKernel()),
                     decltype(v_pos.toKernel()),
                     decltype(o_part_loc.toKernel())>),
                    ite,
                    box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
        starts.resize(o_part_loc.size());
        openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(),
                      o_part_loc.size(),
                      (unsigned int *)starts.template getDeviceBuffer<0>(),
                      v_cl.getgpuContext());

        starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
        size_t total = starts.template get<0>(starts.size()-1);
        size_t old = v_pos.size();

        v_pos.resize(v_pos.size() + total);
        v_prp.resize(v_prp.size() + total);
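        // Generate the shifted ghost copies, writing each one at the offset
        // computed by the scan above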
        ite = v_pos.getGPUIteratorTo(g_m);

        o_part_loc.resize(total);

        CUDA_LAUNCH((shift_ghost_each_part<dim,St,
                     decltype(box_f_dev.toKernel()),
                     decltype(box_f_sv.toKernel()),
                     decltype(v_pos.toKernel()),
                     decltype(v_prp.toKernel()),
                     decltype(starts.toKernel()),
                     decltype(shifts.toKernel()),
                     decltype(o_part_loc.toKernel())>),
                    ite,
                    box_f_dev.toKernel(),box_f_sv.toKernel(),
                    v_pos.toKernel(),v_prp.toKernel(),
                    starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);
#else
        std::cout << __FILE__ << ":" << __LINE__
                  << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
    }
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */