OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
vector_dist_comm_util_funcs.cuh
1/*
2 * vector_dist_comm_util_funcs.hpp
3 *
4 * Created on: Sep 13, 2018
5 * Author: i-bird
6 */
7
8#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
9#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
10
11#include "util/common_pdata.hpp"
12
//! Option flags for the vector_dist communication primitives (ghost_get / map).
//! NOTE(review): the rendered listing had Doxygen line numbers fused into the
//! tokens (e.g. "13constexpr"); restored to valid C++ with values unchanged.

//! Do not create/update ghost particle positions (properties only) —
//! see local_ghost_from_opart_impl, which skips the v_pos resize when set.
constexpr int NO_POSITION = 1;
//! Update ghost particle positions as well.
constexpr int WITH_POSITION = 2;
//! The elements (number/order of particles) did not change since the last
//! call — presumably allows reusing previous labelling; confirm at call sites.
constexpr int NO_CHANGE_ELEMENTS = 4;

//! Bind the decomposition to the ghost layer (meaning defined at call sites).
constexpr int BIND_DEC_TO_GHOST = 1;

//! map() restricted to local/neighborhood communication.
constexpr int MAP_LOCAL = 2;

//! Ghost synchronization modes: blocking ...
constexpr int GHOST_SYNC = 0;
//! ... and asynchronous.
constexpr int GHOST_ASYNC = 1;
23
/*! \brief Fallback (non-CUDA) implementation of the ghost-particle labelling stage.
 *
 * Primary template, selected when is_ok_cuda == false: the data-structures are
 * not CUDA-enabled, so run() only emits a run-time diagnostic on std::cout and
 * returns, leaving all output arguments untouched.
 *
 * NOTE(review): this is a Doxygen-rendered fragment; the struct-name line and
 * several parameter-type lines (original source lines 25, 29-30, 32, 35,
 * 40-42) are missing from this view, so the signature shown is partial.
 * The fused numeric prefixes on each line are render artifacts.
 */
24template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
26{
27 static void run(CudaMemory & mem,
28 Decomposition & dec,
31 memory_traits_inte> & g_opart_device,
33 Memory,
34 layout_base> & proc_id_out,
36 Memory,
37 layout_base> & starts,
38 Vcluster<Memory> & v_cl,
39 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
43 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
44 size_t & g_m,
45 size_t opt)
46 {
// Deliberate run-time (not compile-time) diagnostic, mirroring the other
// non-CUDA fallbacks in this header.
47 std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
48 }
49};
50
51
52
/*! \brief CUDA-enabled implementation of the ghost-particle labelling stage.
 *
 * Pipeline: for each local particle, ask the decomposition (on the device) how
 * many neighbouring processors need a ghost copy; exclusive-scan those counts;
 * emit one (label, particle) entry per copy into g_opart_device; sort the
 * entries by their first component; then derive the per-processor offset table
 * prc_offset and unpack it into the prc / prc_sz lists.
 *
 * \param mem        scratch CudaMemory used as a single device-side int counter
 * \param dec        domain decomposition, queried via dec.toKernel()
 * \param g_opart_device out: per-copy (label, particle-id) entries, sorted on <0>
 * \param proc_id_out scratch: ghost-copy count produced by each particle
 * \param starts     scratch: exclusive scan of proc_id_out
 * \param v_cl       V-cluster (processor count + gpu context)
 * \param v_pos      particle positions
 * \param prc_offset out: (offset, processor-label) table, sorted ascending on <0>
 * \param g_m        number of assigned (non-ghost) particles (unused in this body)
 * \param opt        options (unused in this body)
 *
 * NOTE(review): Doxygen-rendered fragment; several parameter lines (original
 * source lines 58-59, 61, 64, 69-71 — including v_prp, prc and prc_sz, which
 * the body references) are missing from this view. The fused numeric prefixes
 * are render artifacts.
 */
53template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
54struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
55{
56 static void run(CudaMemory & mem,
57 Decomposition & dec,
60 memory_traits_inte> & g_opart_device,
62 Memory,
63 layout_base> & proc_id_out,
65 Memory,
66 layout_base> & starts,
67 Vcluster<Memory> & v_cl,
68 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
72 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
73 size_t & g_m,
74 size_t opt)
75 {
76#if defined(CUDA_GPU) && defined(__NVCC__)
77
// Single-processor run: nothing to label, no communication needed.
78 if (v_cl.size() == 1)
79 {return;}
80
// One slot per particle plus a zero sentinel, so the exclusive scan's last
// element equals the total number of ghost entries.
81 proc_id_out.resize(v_pos.size()+1);
82 proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
83 proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
84
85 auto ite = v_pos.getGPUIterator();
86
87 // no work to do return
88 if (ite.wthr.x == 0)
89 {return;}
90
91 // First we count how many entries each particle produces
92 CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
93 ite,
94 dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
95
96 // scan
97 //sc.scan_(proc_id_out,starts);
98 starts.resize(proc_id_out.size());
99 openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getgpuContext());
// Only the scan's last element (= total entry count) is needed on the host.
100 starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
101 size_t sz = starts.template get<0>(starts.size()-1);
102
103 // we compute processor id for each particle
104
105 g_opart_device.resize(sz);
106
107 ite = v_pos.getGPUIterator();
108
109 // we compute processor id for each particle
110 CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
111 ite,
112 dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
113
114 // sort particles by their first component (the processor label)
115 openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), gpu::template less_t<int>(), v_cl.getgpuContext());
116
// Device-side counter for the number of label boundaries found below.
117 mem.allocate(sizeof(int));
118 mem.fill(0);
119 prc_offset.resize(v_cl.size());
120
121 ite = g_opart_device.getGPUIterator();
122
123 if (ite.wthr.x != 0)
124 {
125 // Find the buffer bases
126 CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
127 ite,
128 g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
129 }
130
131 // Transfer the number of offsets to the CPU
132 mem.deviceToHost();
133 int noff = *(int *)mem.getPointer();
134
135 // create the terminal of prc_offset
136 prc_offset.resize(noff+1,DATA_ON_DEVICE);
137
138 // Move the last processor index on device (id)
139 if (g_opart_device.size() != 0)
140 {g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
// Terminal entry: (total size, label of the last sorted entry) — or 0 when empty.
141 prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
142 if (g_opart_device.size() != 0)
143 {prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
144 else
145 {prc_offset.template get<1>(prc_offset.size()-1) = 0;}
146
147 prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
148
149 // Here we reorder the offsets in ascending order
150 openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), gpu::template less_t<int>(), v_cl.getgpuContext());
151
152 prc_offset.template deviceToHost<0,1>();
153
154 // In this case we do not have communications at all
// noff = -1 makes the noff+1 resizes/loop below degenerate to empty.
155 if (g_opart_device.size() == 0)
156 {noff = -1;}
157
158 prc.resize(noff+1);
159 prc_sz.resize(noff+1);
160
161 size_t base_offset = 0;
162
163 // Transfer to prc the list of processors
// NOTE(review): this resize duplicates the one a few lines above — harmless.
164 prc.resize(noff+1);
165 for (size_t i = 0 ; i < noff+1 ; i++)
166 {
// Offsets are cumulative; successive differences give the per-processor counts.
167 prc.get(i) = prc_offset.template get<1>(i);
168 prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
169 base_offset = prc_offset.template get<0>(i);
170 }
171#else
172
173 std::cout << __FILE__ << ":" << __LINE__ << " error: to use gpu computation you must compile vector_dist.hpp with NVCC" << std::endl;
174
175#endif
176 }
177};
178
/*! \brief Fallback (non-CUDA) implementation of local-ghost creation from o_part_loc.
 *
 * Primary template, selected when is_ok_cuda == false: run() only emits a
 * run-time diagnostic on std::cout and returns, leaving all arguments untouched.
 *
 * NOTE(review): Doxygen-rendered fragment; the struct-name line (original
 * source line 180) and the v_prp parameter line (185) are missing from this
 * view. The fused numeric prefixes on each line are render artifacts.
 */
179template<bool with_pos,unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
181{
182 static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
183 const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
184 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
186 size_t opt)
187 {
// Deliberate run-time (not compile-time) diagnostic, mirroring the other
// non-CUDA fallbacks in this header.
188 std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
189 }
190};
191
/*! \brief CUDA-enabled creation of local ghost particles from a precomputed
 *         o_part_loc labelling.
 *
 * Appends one ghost entry per o_part_loc record to v_pos / v_prp and launches
 * a kernel that fills the new tail, applying the periodic-boundary shift from
 * \p shifts (positions are written only when the with_pos template flag is set).
 *
 * \param o_part_loc (shift/box index, particle id) records to replicate
 * \param shifts     periodic-image shift vectors, indexed on the device
 * \param v_pos      particle positions; grown by o_part_loc.size() unless
 *                   opt contains NO_POSITION
 * \param opt        option flags (NO_POSITION, SKIP_LABELLING)
 *
 * NOTE(review): Doxygen-rendered fragment; the v_prp parameter line (original
 * source line 198) is missing from this view although the body resizes and
 * passes v_prp. The fused numeric prefixes are render artifacts.
 */
192template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
193struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
194{
195 static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
196 const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
197 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
199 size_t opt)
200 {
201#if defined(CUDA_GPU) && defined(__NVCC__)
202
203 auto ite = o_part_loc.getGPUIterator();
204
// Remember where the ghost tail begins: the kernel writes starting at 'old'.
205 size_t old = v_pos.size();
206
207 if (!(opt & NO_POSITION))
208 {v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}
209
// With SKIP_LABELLING the property buffer is presumed already sized from a
// previous call — TODO(review) confirm against the callers.
210 if (!(opt & SKIP_LABELLING))
211 {
212 v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
213 }
214
215
216 if (ite.wthr.x != 0)
217 {
218 CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
219 ite,
220 o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
221 }
222#else
223 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
224#endif
225 }
226};
227
/*! \brief Fallback (non-CUDA) implementation of local-ghost creation from the
 *         decomposition.
 *
 * Primary template, selected when is_ok_cuda == false: run() only emits a
 * run-time diagnostic on std::cout and returns, leaving all arguments untouched.
 *
 * NOTE(review): Doxygen-rendered fragment; the struct-name line (original
 * source line 229) and the v_prp parameter line (238) are missing from this
 * view. The fused numeric prefixes on each line are render artifacts.
 */
228template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
230{
231 static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
232 const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
233 openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
234 openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
235 Vcluster<Memory> & v_cl,
236 openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
237 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
239 size_t & g_m,
240 size_t opt)
241 {
// Deliberate run-time (not compile-time) diagnostic, mirroring the other
// non-CUDA fallbacks in this header.
242 std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
243 }
244};
245
246
/*! \brief CUDA-enabled creation of local (periodic) ghost particles directly
 *         from the decomposition's ghost boxes.
 *
 * Two-pass scheme over the first g_m (assigned) particles: pass one counts,
 * per particle, how many ghost boxes it falls in; an exclusive scan of those
 * counts gives both the total number of ghosts and each particle's write
 * offset; pass two appends the shifted copies to v_pos / v_prp.
 *
 * \param o_part_loc scratch/out: per-particle counts, then the produced records
 * \param shifts     periodic-image shift vectors, indexed on the device
 * \param box_f_dev  ghost boxes to test particles against
 * \param box_f_sv   shift-vector index associated with each box
 * \param v_cl       V-cluster (gpu context for scan)
 * \param starts     scratch: exclusive scan of the per-particle counts
 * \param v_pos      particle positions; grown by the total ghost count
 * \param g_m        number of assigned (non-ghost) particles
 * \param opt        options (unused in this body)
 *
 * NOTE(review): Doxygen-rendered fragment; the v_prp parameter line (original
 * source line 257) is missing from this view although the body resizes and
 * passes v_prp. The fused numeric prefixes are render artifacts.
 */
247template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
248struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
249{
250 static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
251 const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
252 openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
253 openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
254 Vcluster<Memory> & v_cl,
255 openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
256 openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
258 size_t & g_m,
259 size_t opt)
260 {
261#if defined(CUDA_GPU) && defined(__NVCC__)
262
// One count slot per assigned particle plus a zero sentinel, so the exclusive
// scan's last element equals the total number of ghosts to create.
263 o_part_loc.resize(g_m+1);
264 o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
265 o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);
266
267 // Label the internal (assigned) particles
268 auto ite = v_pos.getGPUIteratorTo(g_m);
269
270 // label particle processor
271 CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
272 ite,
273 box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
274
275 starts.resize(o_part_loc.size());
276 openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getgpuContext());
277
// Only the scan's last element (= total ghost count) is needed on the host.
278 starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
279 size_t total = starts.template get<0>(starts.size()-1);
280 size_t old = v_pos.size();
281
282 v_pos.resize(v_pos.size() + total);
283 v_prp.resize(v_prp.size() + total);
284
285 // Label the internal (assigned) particles
286 ite = v_pos.getGPUIteratorTo(g_m);
287
288 // resize o_part_loc to hold one record per produced ghost
289 o_part_loc.resize(total);
290
// Second pass: write the shifted copies into the tail starting at 'old',
// using 'starts' to place each particle's output.
291 CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
292 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
293 decltype(starts.toKernel()),decltype(shifts.toKernel()),
294 decltype(o_part_loc.toKernel())>),
295 ite,
296 box_f_dev.toKernel(),box_f_sv.toKernel(),
297 v_pos.toKernel(),v_prp.toKernel(),
298 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);
299
300#else
301 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
302#endif
303 }
304};
305
306#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */
This class represent an N-dimensional box.
Definition Box.hpp:61
virtual void * getDevicePointer()
get a readable pointer with the data
virtual void deviceToHost()
Move memory from device to host.
virtual void fill(unsigned char c)
fill the buffer with a byte
virtual void * getPointer()
get a readable pointer with the data
virtual bool allocate(size_t sz)
allocate memory
Definition CudaMemory.cu:38
This class defines the domain decomposition interface.
This class implement the point shape in an N-dimensional space.
Definition Point.hpp:28
size_t size()
Get the total number of processors.
gpu::ofp_context_t & getgpuContext(bool iw=true)
If nvidia cuda is activated return a gpu context.
Implementation of VCluster class.
Definition VCluster.hpp:59
Implementation of 1-D std::vector like structure.
size_t size()
Stub size.
Aggregate of properties: from a list of types it creates a struct that follows the OpenFPM native structure.
Transform the boost::fusion::vector into memory specification (memory_traits)