#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_

#if defined(CUDA_GPU) && defined(__NVCC__)
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "util/cuda/kernels.cuh"
#endif

#include "Vector/util/vector_dist_funcs.hpp"
#include "cuda/vector_dist_comm_util_funcs.cuh"
#include "util/cuda/scan_ofp.cuh"
// (debug helper overloads; the enclosing struct definitions are elided in the extraction)
    static float ret(T & tmp)
    // ...

    static float ret(float & tmp)
    // ...
inline static size_t compute_options(size_t opt)
{
    size_t opt_ = NONE;
    if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING)
    {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}

    if (opt & RUN_ON_DEVICE)
    {
#if defined(CUDA_GPU) && defined(__NVCC__)
        opt_ |= MPI_GPU_DIRECT;
#else
        std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
    }

    return opt_;
}
//! template selector for synchronous or asynchronous ghost exchange
template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
struct ghost_exchange_comm_impl
{
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp(Vcluster_type & v_cl,
                                    openfpm::vector<send_vector> & g_send_prp,
                                    vector_prop_type & v_prp,
                                    vector_pos_type & v_pos,
                                    prc_g_opart_type & prc_g_opart,
                                    prc_recv_get_type & prc_recv_get,
                                    recv_sz_get_type & recv_sz_get,
                                    recv_sz_get_byte_type & recv_sz_get_byte,
                                    g_opart_sz_type & g_opart_sz,
                                    size_t g_m,
                                    size_t opt)
    {
        // If the property pack is empty there is nothing to exchange
        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
        }

        // record how many particles were sent to each processor
        g_opart_sz.resize(prc_g_opart.size());

        for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
            g_opart_sz.get(i) = g_send_prp.get(i).size();
    }
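// --- Illustrative sketch (not part of this header) --------------------------
// The sizeof...(prp) guard above skips the property exchange entirely when the
// compile-time property pack is empty. A standalone sketch of that guard:
#if 0
#include <iostream>

template<unsigned int ... prp>
void exchange_properties()
{
    if (sizeof...(prp) != 0)
        std::cout << "exchanging " << sizeof...(prp) << " properties\n";
    else
        std::cout << "empty pack: nothing to send\n";
}

int main()
{
    exchange_properties<>();        // empty pack: nothing to send
    exchange_properties<0, 2, 3>(); // exchanging 3 properties
}
#endif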
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos(Vcluster_type & v_cl,
                                    openfpm::vector<send_pos_vector> & g_pos_send,
                                    vector_prop_type & v_prp,
                                    vector_pos_type & v_pos,
                                    prc_recv_get_type & prc_recv_get,
                                    recv_sz_get_type & recv_sz_get,
                                    prc_g_opart_type & prc_g_opart,
                                    size_t opt)
    {
        size_t opt_ = compute_options(opt);
        if (opt & SKIP_LABELLING)
        {
            v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            prc_recv_get.clear();
            recv_sz_get.clear();
            v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
    }
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
                                         openfpm::vector<send_pos_vector> & g_pos_send,
                                         vector_prop_type & v_prp,
                                         vector_pos_type & v_pos,
                                         prc_recv_get_type & prc_recv_get,
                                         recv_sz_get_type & recv_sz_get,
                                         prc_g_opart_type & prc_g_opart,
                                         size_t opt)
    {
        // synchronous implementation: the exchange already completed, nothing to wait for
    }
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
                                         openfpm::vector<send_vector> & g_send_prp,
                                         vector_prop_type & v_prp,
                                         vector_pos_type & v_pos,
                                         prc_g_opart_type & prc_g_opart,
                                         prc_recv_get_type & prc_recv_get,
                                         recv_sz_get_type & recv_sz_get,
                                         recv_sz_get_byte_type & recv_sz_get_byte,
                                         g_opart_sz_type & g_opart_sz,
                                         size_t g_m,
                                         size_t opt)
    {
        // synchronous implementation: the exchange already completed, nothing to wait for
    }
};
//! Specialization for the asynchronous exchange
template<template<typename> class layout_base, unsigned int ... prp>
struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp...>
{
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp(Vcluster_type & v_cl,
                                    openfpm::vector<send_vector> & g_send_prp,
                                    vector_prop_type & v_prp,
                                    vector_pos_type & v_pos,
                                    prc_g_opart_type & prc_g_opart,
                                    prc_recv_get_type & prc_recv_get,
                                    recv_sz_get_type & recv_sz_get,
                                    recv_sz_get_byte_type & recv_sz_get_byte,
                                    g_opart_sz_type & g_opart_sz,
                                    size_t g_m,
                                    size_t opt)
    {
        prc_recv_get.clear();
        // ...

        // If the property pack is empty there is nothing to exchange
        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
        }

        // record how many particles were sent to each processor
        g_opart_sz.resize(prc_g_opart.size());

        for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
        {g_opart_sz.get(i) = g_send_prp.get(i).size();}
    }
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos(Vcluster_type & v_cl,
                                    openfpm::vector<send_pos_vector> & g_pos_send,
                                    vector_prop_type & v_prp,
                                    vector_pos_type & v_pos,
                                    prc_recv_get_type & prc_recv_get,
                                    recv_sz_get_type & recv_sz_get,
                                    prc_g_opart_type & prc_g_opart,
                                    size_t opt)
    {
        prc_recv_get.clear();
        // ...

        size_t opt_ = compute_options(opt);
        if (opt & SKIP_LABELLING)
        {
            v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            prc_recv_get.clear();
            // ...
            v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
    }
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
                                         openfpm::vector<send_pos_vector> & g_pos_send,
                                         vector_prop_type & v_prp,
                                         vector_pos_type & v_pos,
                                         prc_recv_get_type & prc_recv_get,
                                         recv_sz_get_type & recv_sz_get,
                                         prc_g_opart_type & prc_g_opart,
                                         size_t opt)
    {
        size_t opt_ = compute_options(opt);
        if (opt & SKIP_LABELLING)
        {
            v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
    }
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
                                         openfpm::vector<send_vector> & g_send_prp,
                                         vector_prop_type & v_prp,
                                         vector_pos_type & v_pos,
                                         prc_g_opart_type & prc_g_opart,
                                         prc_recv_get_type & prc_recv_get,
                                         recv_sz_get_type & recv_sz_get,
                                         recv_sz_get_byte_type & recv_sz_get_byte,
                                         g_opart_sz_type & g_opart_sz,
                                         size_t g_m,
                                         size_t opt)
    {
        // If the property pack is empty there is nothing to wait for
        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
        }
        // ...
    }
};
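// --- Illustrative sketch (not part of this header) --------------------------
// The Async/Wait split above (SSendRecvPAsync starts the exchange, the *Wait
// variants block for completion) allows local work to overlap communication.
// A loose standard-library analogy of that start/overlap/wait pattern,
// standing in for the MPI machinery:
#if 0
#include <future>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> send_buf{1, 2, 3};

    // "Async" phase: kick off the exchange (here faked with a background task).
    auto pending = std::async(std::launch::async, [&send_buf] {
        long sum = 0;
        for (int v : send_buf) sum += v; // stands in for the transfer
        return sum;
    });

    // ... overlap computation on local particles here ...

    // "Wait" phase: block until the exchange completed.
    std::cout << "received: " << pending.get() << "\n";
}
#endif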
//! This class is a helper for the communication of vector_dist
template<unsigned int dim,
         typename St,
         typename prop,
         typename Decomposition,
         typename Memory,
         template<typename> class layout_base>
class vector_dist_comm
{
    // ...

    //! It processes one particle, copying the selected properties into the send buffer
    template<typename prp_object, int ... prp>
    struct proc_with_prp
    {
        //! process the particle
        template<typename T1, typename T2>
        inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)
        {
            // ...
            object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
        }
    };
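// --- Illustrative sketch (not part of this header) --------------------------
// object_si_d above copies only the compile-time-selected properties from one
// encapsulated object to another. A minimal sketch of that idea, using
// std::tuple in place of OpenFPM's encapsulated objects:
#if 0
#include <iostream>
#include <tuple>

template<int ... prp, typename Src, typename Dst>
void copy_selected(const Src & src, Dst & dst)
{
    // Copy each compile-time-selected field from src to dst (C++17 fold).
    ((std::get<prp>(dst) = std::get<prp>(src)), ...);
}

int main()
{
    std::tuple<int, double, char> src{7, 3.14, 'x'};
    std::tuple<int, double, char> dst{0, 0.0, '?'};

    copy_selected<0, 2>(src, dst); // copy properties 0 and 2 only

    std::cout << std::get<0>(dst) << " " << std::get<1>(dst)
              << " " << std::get<2>(dst) << "\n"; // 7 0 x
}
#endif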
    //! Get the number of particles received from processor i during the last ghost_get
    size_t get_last_ghost_get_received_parts(size_t i)
    {
        // If the last ghost_get exchanged properties, use the property receive sizes
        if (recv_sz_get_prp.size() != 0)
        {return recv_sz_get_prp.get(i);}
        // ...
    }

    //! Get the number of processors involved during the last ghost_get
    size_t get_last_ghost_get_num_proc()
    {
        if (prc_recv_get_prp.size() != 0)
        {return prc_recv_get_prp.size();}
        // ...
    }

    //! Get the processor list of the last ghost_get
    openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector()
    {
        if (prc_recv_get_prp.size() != 0)
        {return prc_recv_get_prp;}
        // ...
    }
    //! Calculate the sending buffer size for each processor
    void calc_send_buffers(openfpm::vector<aggregate<unsigned int, unsigned int>, Memory, layout_base> & prc_sz,
                           openfpm::vector<size_t> & prc_sz_r,
                           openfpm::vector<size_t> & prc_r,
                           size_t opt)
    {
        if (opt & RUN_ON_DEVICE)
        {
            size_t prev_off = 0;
            for (size_t i = 0; i < prc_sz.size() ; i++)
            {
                // a processor id of (unsigned int)-1 marks an empty slot
                if (prc_sz.template get<1>(i) != (unsigned int)-1)
                {
                    prc_r.add(prc_sz.template get<1>(i));
                    prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
                }

                prev_off = prc_sz.template get<0>(i);
            }
        }
        else
        {
            // ...
            for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
            {
                if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
                {
                    // ...
                    prc_sz_r.add(prc_sz.template get<0>(i));
                }
            }
            // ...
            for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
            {
                if (prc_sz.template get<0>(i) != 0)
                {
                    // ...
                    prc_sz_r.add(prc_sz.template get<0>(i));
                }
            }
        }
    }
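// --- Illustrative sketch (not part of this header) --------------------------
// The device branch above recovers per-processor send counts by differencing
// cumulative offsets. A self-contained sketch of that differencing, with
// hypothetical data in plain std::vectors:
#if 0
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // Cumulative particle counts after a scan, one entry per slot, paired
    // with the processor id (-1 marks "no particles for this slot").
    std::vector<std::pair<unsigned, int>> prc_sz = {
        {4, 0}, {4, -1}, {9, 2}, {12, 3}
    };

    std::vector<int>      prc_r;    // processors we actually send to
    std::vector<unsigned> prc_sz_r; // how many particles each one gets

    unsigned prev_off = 0;
    for (auto & e : prc_sz)
    {
        if (e.second != -1)
        {
            prc_r.push_back(e.second);
            prc_sz_r.push_back(e.first - prev_off); // difference of offsets
        }
        prev_off = e.first;
    }

    for (size_t i = 0; i < prc_r.size(); i++)
        printf("proc %d gets %u particles\n", prc_r[i], prc_sz_r[i]);
}
#endif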
    struct sh_box
    {
        // ...
        unsigned int box_f_sv;

        bool operator<(const sh_box & tmp) const
        {
            return shift_id < tmp.shift_id;
        }
    };
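// --- Illustrative sketch (not part of this header) --------------------------
// operator< above exists so that shift-box records can be sorted by shift_id,
// making boxes that share a shift vector contiguous. A minimal sketch with a
// hypothetical record type and std::sort:
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

struct sh_box_ex
{
    unsigned shift_id;
    unsigned payload;
    bool operator<(const sh_box_ex & tmp) const { return shift_id < tmp.shift_id; }
};

int main()
{
    std::vector<sh_box_ex> v{{3, 30}, {1, 10}, {3, 31}, {2, 20}};
    std::sort(v.begin(), v.end()); // groups records with equal shift_id
    for (auto & e : v) printf("shift %u payload %u\n", e.shift_id, e.payload);
}
#endif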
    //! For every internal ghost box, create a structure that orders the local ghost boxes by shift
    void createShiftBox()
    {
        // ...
        for (size_t i = 0; i < dec.getNLocalSub(); i++)
        {
            size_t Nl = dec.getLocalNIGhost(i);

            for (size_t j = 0; j < Nl; j++)
            {
                // If the shift is the null vector, the box needs no replication
                if (dec.getLocalIGhostPos(i, j).n_zero() == dim)
                {continue;}
                // ...
                auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());
                // ... (shift not seen before: open a new group)
                box_f.last().add(dec.getLocalIGhostBox(i, j));
                // ... (shift already present: append to its group)
                box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
                // ...
                reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
                reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
                reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
            }
        }

        // ... (sort reord_shift by shift_id, then flatten into device vectors)
        box_f_sv.resize(reord_shift.size());

        for (size_t i = 0 ; i < reord_shift.size() ; i++)
        {
            box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
            box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
        }
        // ...
        box_f_sv.template hostToDevice<0>();
    }
    //! Local ghost from labelled particles
    void local_ghost_from_opart(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                                openfpm::vector<prop,Memory,layout_base> & v_prp,
                                size_t opt)
    {
        // ...
        if (!(opt & NO_POSITION))
        {
            if (opt & RUN_ON_DEVICE)
            {
                // ...
            }
            else
            {
                for (size_t i = 0 ; i < o_part_loc.size() ; i++)
                {
                    size_t lin_id = o_part_loc.template get<1>(i);
                    // ...
                    p -= shifts.get(lin_id);
                    // ...
                    v_prp.get(lg_m+i) = v_prp.get(key);
                }
            }
        }
        else
        {
            if (opt & RUN_ON_DEVICE)
            {
                // ...
            }
            else
            {
                for (size_t i = 0 ; i < o_part_loc.size() ; i++)
                {
                    // ...
                    v_prp.get(lg_m+i) = v_prp.get(key);
                }
            }
        }
    }
    //! Local ghost from decomposition
    void local_ghost_from_dec(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                              openfpm::vector<prop,Memory,layout_base> & v_prp,
                              size_t g_m,
                              size_t opt)
    {
        // ...
        if (opt & RUN_ON_DEVICE)
        {
            // ...
        }
        else
        {
            // Iterate over the real particles only
            auto it = v_pos.getIteratorTo(g_m);
            while (it.isNext())
            {
                auto key = it.get();
                // ...
                for (size_t i = 0; i < box_f.size(); i++)
                {
                    for (size_t j = 0; j < box_f.get(i).size(); j++)
                    {
                        if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
                        {
                            size_t lin_id = dec.convertShift(box_cmb.get(i));
                            // ...
                            p -= shifts.get(lin_id);
                            // ... (replicate the particle, shifted)
                            v_prp.last() = v_prp.get(key);
                        }
                    }
                }

                ++it;
            }
        }
    }
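// --- Illustrative sketch (not part of this header) --------------------------
// The loop above replicates any real particle that falls inside a border box,
// shifted by the periodic image vector. A 1-D sketch of that replication,
// with hypothetical domain and ghost-layer sizes:
#if 0
#include <cstdio>
#include <vector>

int main()
{
    const double L = 10.0;    // domain [0, L)
    const double ghost = 1.0; // ghost layer width
    std::vector<double> pos{0.4, 5.0, 9.7};

    size_t g_m = pos.size();  // real particles end here
    for (size_t k = 0; k < g_m; k++)
    {
        if (pos[k] < ghost)          pos.push_back(pos[k] + L); // image in right ghost layer
        else if (pos[k] > L - ghost) pos.push_back(pos[k] - L); // image in left ghost layer
    }

    for (size_t k = 0; k < pos.size(); k++)
        printf("%s %.1f\n", k < g_m ? "real " : "ghost", pos[k]);
}
#endif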
    //! Fill the send buffer with particle positions after labelling
    void fill_send_ghost_pos_buf(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                                 openfpm::vector<size_t> & prc_sz,
                                 openfpm::vector<send_pos_vector> & g_pos_send,
                                 size_t opt,
                                 bool async)
    {
        // ...
        if (!(opt & SKIP_LABELLING))
        {
            // ...
        }

        if (box_f.size() == 0)
        {
            // ...
        }

        if (opt & SKIP_LABELLING)
        {
            // ...
        }

        // resize the send vector: one chunk per target processor
        g_pos_send.resize(prc_sz.size());

        size_t old_hsmem_size = 0;
        // ...
        for (size_t i = 0; i < g_pos_send.size(); i++)
        {
            // Retain the buffer so it can be reused across calls
            if (hsmem.get(i+old_hsmem_size).ref() == 0)
            {hsmem.get(i+old_hsmem_size).incRef();}

            // Set the memory for retaining the send buffer
            g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));

            // resize the sending vector (no allocation is produced)
            g_pos_send.get(i).resize(prc_sz.get(i));
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            // ...
            for (size_t i = 0 ; i < g_pos_send.size() ; i++)
            {
                auto ite = g_pos_send.get(i).getGPUIterator();

                CUDA_LAUNCH((process_ghost_particles_pos<dim,
                                decltype(g_opart_device.toKernel()),
                                decltype(g_pos_send.get(i).toKernel()),
                                decltype(v_pos.toKernel()),
                                decltype(shifts.toKernel())>),
                            ite, g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
                            v_pos.toKernel(), shifts.toKernel(), offset);
                // ...
            }
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but the code seems to have been compiled with a normal compiler" << std::endl;
#endif
        }
        else
        {
            // ... (for each target processor i, outer loop elided)
            for (size_t j = 0; j < g_opart.get(i).size(); j++)
            {
                // ... (copy the position and apply the periodic shift)
                s -= shifts.get(g_opart.get(i).template get<1>(j));

                g_pos_send.get(i).set(j, s);
            }
        }
    }
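// --- Illustrative sketch (not part of this header) --------------------------
// The incRef/decRef pairs on hsmem above implement a retained-buffer pool:
// send buffers survive between calls so later exchanges reuse them without
// reallocating. A minimal sketch of that design, with std::shared_ptr
// standing in for the reference-counted Memory objects:
#if 0
#include <cstdio>
#include <memory>
#include <vector>

struct Pool
{
    std::vector<std::shared_ptr<std::vector<char>>> bufs;

    std::shared_ptr<std::vector<char>> acquire(size_t i, size_t bytes)
    {
        if (i >= bufs.size()) bufs.resize(i + 1);
        if (!bufs[i]) bufs[i] = std::make_shared<std::vector<char>>(); // retain
        bufs[i]->resize(bytes); // reallocates only when it must grow
        return bufs[i];
    }
};

int main()
{
    Pool pool;
    auto b0 = pool.acquire(0, 1024); // first call allocates
    auto b1 = pool.acquire(0, 512);  // second call reuses the block
    printf("same buffer reused: %d\n", b0.get() == b1.get()); // 1
}
#endif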
    //! Fill the send buffer for ghost_put
    template<typename send_vector, typename prp_object, int ... prp>
    void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
                                     openfpm::vector<send_vector> & g_send_prp,
                                     size_t & g_m,
                                     size_t opt)
    {
        // ...
        g_send_prp.resize(nproc);
        // ...
        for (size_t i = 0; i < g_send_prp.size(); i++)
        {
            // Retain the buffer so it can be reused across calls
            if (hsmem.get(i).ref() == 0)
                hsmem.get(i).incRef();

            // Set the memory for retaining the send buffer
            g_send_prp.get(i).setMemory(hsmem.get(i));

            // resize the sending vector (no allocation is produced)
            g_send_prp.get(i).resize(n_part_recv);
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            if (sizeof...(prp) != 0)
            {
                // ...
                for (size_t i = 0 ; i < g_send_prp.size() ; i++)
                {
                    // ...
                    auto ite = g_send_prp.get(i).getGPUIterator();

                    if (ite.nblocks() == 0) {continue;}

                    CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),
                                    decltype(v_prp.toKernel()),prp...>),
                                ite,
                                g_send_prp.get(i).toKernel(),
                                v_prp.toKernel(),accum);

                    accum = accum + n_part_recv;
                }
            }
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but the code seems to have been compiled with a normal compiler" << std::endl;
#endif
        }
        else
        {
            // ...
            for (size_t i = 0; i < g_send_prp.size(); i++)
            {
                // ...
                for (size_t j = accum; j < accum + n_part_recv; j++)
                {
                    // ... (copy only the selected properties; j2 is the slot in the chunk)
                    object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
                }

                accum = accum + n_part_recv;
            }
        }
    }
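// --- Illustrative sketch (not part of this header) --------------------------
// Each processor's particles occupy a contiguous slice of one flat buffer,
// starting at the running offset accum maintained above. A standalone sketch
// of that bookkeeping with hypothetical per-processor counts:
#if 0
#include <cstdio>
#include <vector>

int main()
{
    std::vector<size_t> n_part_recv{3, 0, 5, 2}; // per-processor counts

    size_t accum = 0;
    for (size_t i = 0; i < n_part_recv.size(); i++)
    {
        printf("proc %zu: slice [%zu, %zu)\n", i, accum, accum + n_part_recv[i]);
        accum += n_part_recv[i];
    }
}
#endif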
    //! Resize the retained buffer to nbf, releasing the references of the dropped buffers
    void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf)
    {
        // ...
        for (size_t i = nbf ; i < rt_buf.size() ; i++)
        {
            rt_buf.get(i).decRef();
        }
        // ...
    }
    //! Set the buffer for each property (interleaved memory layout)
    template<typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers_inte
    {
        // ...
        openfpm::vector_fr<Memory> & hsmem;
        // ...
        set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i,
                                      openfpm::vector_fr<Memory> & hsmem, size_t j)
        :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
        {}

        //! It calls the setMemory function for each property
        template<typename T>
        void operator()(T & t)
        {
            g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));
            // ...
        }
    };

    template<bool inte_or_lin, typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers
    {
        static inline size_t set_mem_retained(openfpm::vector<send_vector> & g_send_prp,
                                              size_t i,
                                              openfpm::vector_fr<Memory> & hsmem,
                                              openfpm::vector<size_t> & prc_sz,
                                              size_t j)
        {
            // Set the memory for retaining the send buffer
            g_send_prp.get(i).setMemory(hsmem.get(j));

            // resize the sending vector (no allocation is produced)
            g_send_prp.get(i).resize(prc_sz.get(i));
            // ...
        }
    };

    //! Specialization for the interleaved memory layout
    template<typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers<true,send_vector,v_mpl>
    {
        static inline size_t set_mem_retained(openfpm::vector<send_vector> & g_send_prp,
                                              size_t i,
                                              openfpm::vector_fr<Memory> & hsmem,
                                              openfpm::vector<size_t> & prc_sz,
                                              size_t j)
        {
            set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);

            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);

            // even without properties the vector still has to be resized
            if (boost::mpl::size<v_mpl>::type::value != 0)
            {
                g_send_prp.get(i).resize(prc_sz.get(i));
            }
            // ...
        }
    };
    //! Fill the send buffer with the selected properties after labelling
    template<typename send_vector, typename prp_object, int ... prp>
    void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
                                 openfpm::vector<size_t> & prc_sz,
                                 openfpm::vector<send_vector> & g_send_prp,
                                 size_t opt)
    {
        size_t factor = 1;
        // An interleaved layout needs one buffer per property
        if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);}

        g_send_prp.resize(prc_sz.size());
        // ...
        for (size_t i = 0; i < hsmem.size(); i++)
        {
            // Retain the buffers so they can be reused across calls
            if (hsmem.get(i).ref() == 0)
            {hsmem.get(i).incRef();}
        }
        // ...
        for (size_t i = 0; i < g_send_prp.size(); i++)
        {
            // ... (attach retained memory and resize, see set_mem_retained_buffers)
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            if (sizeof...(prp) != 0)
            {
                // ...
                for (size_t i = 0 ; i < g_send_prp.size() ; i++)
                {
                    auto ite = g_send_prp.get(i).getGPUIterator();

                    CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),
                                    decltype(g_send_prp.get(i).toKernel()),
                                    decltype(v_prp.toKernel()),prp...>),
                                ite, g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
                                v_prp.toKernel(), offset);
                    // ...
                }
            }
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but the code seems to have been compiled with a normal compiler" << std::endl;
#endif
        }
        else
        {
            // With an empty property pack there is nothing to pack
            if (sizeof...(prp) == 0) {return;}

            // ... (for each target processor i, outer loop elided)
            for (size_t j = 0; j < g_opart.get(i).size(); j++)
            {
                // source and destination encapsulated objects
                typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;
                typedef decltype(g_send_prp.get(i).get(j)) encap_dst;

                // copy only the selected properties
                object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
            }
        }
    }
    //! Allocate and fill the send buffer for the map function
    void fill_send_map_buf(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                           openfpm::vector<prop,Memory,layout_base> & v_prp,
                           openfpm::vector<size_t> & prc_sz_r,
                           openfpm::vector<size_t> & prc_r,
                           openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
                           openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
                           openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
                           size_t opt)
    {
        m_prp.resize(prc_sz_r.size());
        m_pos.resize(prc_sz_r.size());
        // ...
        for (size_t i = 0; i < prc_sz_r.size() ; i++)
        {
            // ...
            m_pos.get(i).resize(prc_sz_r.get(i));
            m_prp.get(i).resize(prc_sz_r.get(i));
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            // Single processor: every labelled particle is copied back locally
            if (v_cl.size() == 1)
            {
                // ...
                int rank = v_cl.rank();
                // ...
                starts.template deviceToHost<0>();
                size_t offset = starts.template get<0>(rank);
                // ...
                if (ite.wthr.x != 0)
                {
                    CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),
                                    decltype(v_pos_tmp.toKernel()),
                                    decltype(v_prp_tmp.toKernel()),
                                    decltype(v_pos.toKernel()),
                                    decltype(v_prp.toKernel())>),
                                ite, m_opart.toKernel(), v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
                                v_pos.toKernel(), v_prp.toKernel(), offset);
                }
                // ...
            }
            else
            {
                for (size_t i = 0 ; i < m_pos.size() ; i++)
                {
                    size_t offset = starts.template get<0>(prc_r.template get<0>(i));

                    auto ite = m_pos.get(i).getGPUIterator();
                    // ...
                    if (ite.wthr.x != 0)
                    {
                        CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),
                                        decltype(m_pos.get(i).toKernel()),
                                        decltype(m_prp.get(i).toKernel()),
                                        decltype(v_pos.toKernel()),
                                        decltype(v_prp.toKernel())>),
                                    ite,
                                    m_opart.toKernel(), m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
                                    v_pos.toKernel(), v_prp.toKernel(), offset);
                    }
                }
            }
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but the code seems to have been compiled with a normal compiler" << std::endl;
#endif
        }
        else
        {
            // end of the particle set
            long int id_end = v_pos.size();
            // ...
            long int end = m_opart.size()-1;
            // ...
            for (size_t i = 0; i < m_opart.size(); i++)
            {
                process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
            }

            v_pos.resize(v_pos.size() - m_opart.size());
            // ...
        }
    }
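// --- Illustrative sketch (not part of this header) --------------------------
// process_map_particle removes a departing particle by overwriting its slot
// with the last live particle and shrinking the vector at the end, which is
// why id_end/end track the tail above. A standalone sketch of that
// hole-filling removal (indices assumed sorted ascending):
#if 0
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> particles{10, 11, 12, 13, 14};
    std::vector<size_t> leaving{1, 3}; // indices that migrate away

    size_t end = particles.size();
    // iterate from the back so earlier swaps do not disturb later indices
    for (auto it = leaving.rbegin(); it != leaving.rend(); ++it)
        particles[*it] = particles[--end]; // fill the hole from the tail
    particles.resize(end);

    for (int p : particles) printf("%d ", p); // 10 14 12
    printf("\n");
}
#endif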
    //! Allocate and fill the send buffer for the map function (property list version)
    template<typename prp_object, int ... prp>
    void fill_send_map_buf_list(openfpm::vector<Point<dim,St>> & v_pos,
                                openfpm::vector<prop,Memory,layout_base> & v_prp,
                                openfpm::vector<size_t> & prc_sz_r,
                                openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
                                openfpm::vector<openfpm::vector<prp_object>> & m_prp)
    {
        m_prp.resize(prc_sz_r.size());
        m_pos.resize(prc_sz_r.size());
        // ...
        for (size_t i = 0; i < prc_sz_r.size(); i++)
        {
            // ...
            m_pos.get(i).resize(prc_sz_r.get(i));
            m_prp.get(i).resize(prc_sz_r.get(i));
        }

        // end of the particle set
        long int id_end = v_pos.size();
        // ...
        long int end = m_opart.size()-1;
        // ...
        for (size_t i = 0; i < m_opart.size(); i++)
        {
            process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
        }

        v_pos.resize(v_pos.size() - m_opart.size());
        // ...
    }
    //! Label particles for mapping
    void labelParticleProcessor(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                                openfpm::vector<aggregate<int,int,int>,Memory,layout_base> & lbl_p,
                                openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
                                size_t opt)
    {
        if (opt == RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            // ...
            lbl_p.resize(v_pos.size());
            // ...
            prc_sz.template fill<0>(0);
            // ...
            auto ite = v_pos.getGPUIterator();
            if (ite.wthr.x == 0)
            {
                // no particles: just zero the offsets
                starts.template fill<0>(0);
                return;
            }

            // With a single processor only the boundary conditions must be applied
            if (v_cl.size() == 1)
            {
                // ...
                for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);}

                CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());

                return;
            }

            // label each particle with the processor it has to be sent to
            CUDA_LAUNCH((process_id_proc_each_part<dim,St,
                            decltype(dec.toKernel()),
                            decltype(v_pos.toKernel()),
                            decltype(lbl_p.toKernel()),
                            decltype(prc_sz.toKernel())>),
                        ite, dec.toKernel(), v_pos.toKernel(), lbl_p.toKernel(), prc_sz.toKernel(), v_cl.rank());

            // scan the per-processor counts to obtain the start offsets
            openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(),
                          (unsigned int *)starts.template getDeviceBuffer<0>(), v_cl.getgpuContext());

            // ...
            prc_sz.template deviceToHost<0>();

            // ...
            ite = lbl_p.getGPUIterator();
            // ...
            CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: it seems you tried to call map with the RUN_ON_DEVICE option; this requires compiling the program with NVCC" << std::endl;
#endif
        }
        else
        {
            // reset the counters
            prc_sz.template fill<0>(0);
            // ...
            auto it = v_pos.getIterator();

            while (it.isNext())
            {
                auto key = it.get();

                // apply the boundary conditions to the particle position
                dec.applyPointBC(v_pos.get(key));

                size_t p_id = 0;
                // ...
                if (dec.getDomain().isInside(v_pos.get(key)) == true)
                {p_id = dec.processorID(v_pos.get(key));}
                else
                {p_id = obp::out(key, v_cl.getProcessUnitID());}

                // if the particle does not belong to this processor it has to be sent away
                if (p_id != v_cl.getProcessUnitID())
                {
                    if ((long int) p_id != -1)
                    {
                        prc_sz.template get<0>(p_id)++;
                        // ...
                        lbl_p.last().template get<0>() = key;
                        lbl_p.last().template get<2>() = p_id;
                    }
                    else
                    {
                        // out-of-bound particle: label it anyway so it can be removed
                        lbl_p.last().template get<0>() = key;
                        lbl_p.last().template get<2>() = p_id;
                    }
                }

                ++it;
            }
        }
    }
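// --- Illustrative sketch (not part of this header) --------------------------
// openfpm::scan above turns per-processor particle counts into starting
// offsets inside the flat send buffer. The same computation on the host with
// std::exclusive_scan (C++17):
#if 0
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<unsigned> prc_sz{4, 0, 5, 3};   // counts per processor
    std::vector<unsigned> starts(prc_sz.size());

    std::exclusive_scan(prc_sz.begin(), prc_sz.end(), starts.begin(), 0u);

    for (unsigned s : starts) std::cout << s << " "; // 0 4 4 9
    std::cout << "\n";
}
#endif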
    //! Label the particles to replicate as ghosts
    void labelParticlesGhost(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                             openfpm::vector<prop,Memory,layout_base> & v_prp,
                             openfpm::vector<size_t> & prc,
                             openfpm::vector<size_t> & prc_sz,
                             openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
                             size_t & g_m,
                             size_t opt)
    {
        // ...
        if (opt & RUN_ON_DEVICE)
        {
            // ... (device implementation; the qualifier of ::run is elided in the extraction)
            ::run(mem, dec, g_opart_device, proc_id_out, starts,
                  v_cl, v_pos, v_prp, prc, prc_sz, prc_offset, g_m, opt);
        }
        else
        {
            // Iterate over the real particles
            auto it = v_pos.getIteratorTo(g_m);
            while (it.isNext())
            {
                auto key = it.get();
                // ...
                for (size_t i = 0; i < vp_id.size(); i++)
                {
                    // processor id
                    size_t p_id = vp_id.get(i).first;

                    // add particle id and shift id to the sending buffer of p_id
                    g_opart.get(p_id).last().template get<0>() = key;
                    g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
                }

                ++it;
            }

            // ... (for each neighbor processor with particles to send; loop elided)
            g_opart.get(i).swap(g_opart_f.last());
            prc.add(dec.IDtoProc(i));
            // ...
        }

#ifdef EXTREA_TRACE_PRE_COMM
        Extrae_user_function (0);
#endif
    }
    //! Call-back to allocate the buffer that receives incoming elements (particles)
    static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr)
    {
        // cast the context pointer back to this object
        vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *>(ptr);

        vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits());
        vd->recv_mem_gm.get(i).resize(msg_i);

        return vd->recv_mem_gm.get(i).getPointer();
    }
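// --- Illustrative sketch (not part of this header) --------------------------
// message_alloc_map follows the classic C-style "callback with void* context"
// pattern: the communication layer invokes a free function, which recovers
// the object through the context pointer and hands back a buffer. A minimal
// sketch of that pattern with a simplified (hypothetical) signature:
#if 0
#include <cstdio>
#include <vector>

struct Receiver
{
    std::vector<std::vector<char>> recv_bufs;

    static void * alloc_cb(size_t msg_size, size_t src, void * ctx)
    {
        Receiver * self = static_cast<Receiver *>(ctx); // recover the object
        if (src >= self->recv_bufs.size()) self->recv_bufs.resize(src + 1);
        self->recv_bufs[src].resize(msg_size);
        return self->recv_bufs[src].data();
    }
};

int main()
{
    Receiver r;
    void * buf = Receiver::alloc_cb(64, 2, &r); // as the comm layer would call it
    printf("buffer for rank 2: %p (%zu bytes)\n", buf, r.recv_bufs[2].size());
}
#endif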
    //! Copy constructor
    vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
    :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
    {
        // ...
    }

    //! Constructor
    vector_dist_comm()
    :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
    {
        // ...
    }

    //! Destructor: release the retained communication buffers
    ~vector_dist_comm()
    {
        for (size_t i = 0 ; i < hsmem.size() ; i++)
        {
            if (hsmem.get(i).ref() == 1)
                hsmem.get(i).decRef();
            else
                std::cout << __FILE__ << ":" << __LINE__ << " internal error: memory is in an invalid state" << std::endl;
        }
    }
    //! Set the minimum number of sub-domains per processor
    void setDecompositionGranularity(size_t n_sub)
    {
        this->v_sub_unit_factor = n_sub;
    }

    //! Initialize the decomposition
    void init_decomposition(Box<dim,St> & box,
                            const size_t (& bc)[dim],
                            const Ghost<dim,St> & g,
                            size_t opt,
                            const grid_sm<dim,void> & gdist)
    {
        // ...
        if (opt & BIND_DEC_TO_GHOST)
        {
            // ...
            CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
            // ...
            cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);
            // ...
            for (size_t i = 0 ; i < dim ; i++)
            {div[i] = cd_sm.getDiv()[i] - 2*pad;}
            // ...
            dec.setParameters(div, box, bc, g, gdist);
        }
        // ...
    }

    //! Initialize the decomposition from a grid of cells
    void init_decomposition_gr_cell(Box<dim,St> & box,
                                    const size_t (& bc)[dim],
                                    const Ghost<dim,St> & g,
                                    size_t opt,
                                    const grid_sm<dim,void> & gdist)
    {
        // ...
        for (size_t i = 0 ; i < dim ; i++)
        {div[i] = gdist.size(i);}
        // ...
        dec.setParameters(div, box, bc, g);
    }
    //! It synchronizes the properties and positions of the ghost particles
    void ghost_get_(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t & g_m,
                    size_t opt = WITH_POSITION)
    {
#ifdef PROFILE_SCOREP
        SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif
        // ...
        if (!(opt & NO_POSITION))
        {v_pos.resize(g_m);}
        // ...
        if (!(opt & SKIP_LABELLING))
        {v_prp.resize(g_m);}
        // ...
        if ((opt & SKIP_LABELLING) == false)
        {
            // ...
#if defined(CUDA_GPU) && defined(__NVCC__)
            cudaDeviceSynchronize();
#endif
            // ...
        }

        if (!(opt & NO_POSITION))
        {
            // ...
#if defined(CUDA_GPU) && defined(__NVCC__)
            cudaDeviceSynchronize();
#endif
            // ...
        }
        // ...
        if (!(opt & SKIP_LABELLING))
        {
            v_prp.resize(v_pos.size());
            // ...
        }
    }
    //! Move all particles that do not belong to the local processor to the respective processor
    template<unsigned int ... prp>
    void map_list_(openfpm::vector<Point<dim,St>> & v_pos,
                   openfpm::vector<prop> & v_prp,
                   size_t & g_m,
                   size_t opt = WITH_POSITION)
    {
        // ...
        if (opt & RUN_ON_DEVICE)
        {
            std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
            return;
        }
        // ...
        for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
        {
            if (prc_sz.template get<0>(i) != 0)
            {
                // ...
                prc_sz_r.add(prc_sz.template get<0>(i));
            }
        }

        if (opt & MAP_LOCAL)
        {
            // ...
            for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
            {
                // ...
            }
        }
        // ...
        v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
        // ...
    }
    //! Move all particles that do not belong to the local processor to the respective processor
    template<typename obp = KillParticle>
    void map_(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
              openfpm::vector<prop,Memory,layout_base> & v_prp,
              size_t & g_m,
              size_t opt)
    {
#ifdef PROFILE_SCOREP
        SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif
        // ...
        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            // make sure all previous kernels completed before communicating
            cudaDeviceSynchronize();
            opt_ |= MPI_GPU_DIRECT;
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
        }
        // ...
        v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
            // ... (remaining template arguments and call elided)
    }
    //! Copy a vector
    vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
    operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
    {
        // ...
    }

    //! Move a vector
    vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
    operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
    {
        // ...
    }
    //! Ghost put
    template<template<typename,typename> class op, int ... prp>
    void ghost_put_(openfpm::vector<Point<dim,St>,Memory,layout_base> & v_pos,
                    openfpm::vector<prop,Memory,layout_base> & v_prp,
                    size_t & g_m,
                    size_t opt)
    {
        // ...
        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)
            // make sure all previous kernels completed before communicating
            cudaDeviceSynchronize();
#else
            std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
        }
        // ...
        if (opt & NO_CHANGE_ELEMENTS)
        {
            size_t opt_ = compute_options(opt);
            // ...
            if (opt & RUN_ON_DEVICE)
            {
                // ...
            }
            // ...
        }
        else
        {
            size_t opt_ = compute_options(opt);
            // ...
            if (opt & RUN_ON_DEVICE)
            {
                // ...
            }
            // ...
        }

        // ... (consistency check between local ghosts and the last ghost_get)
        {
            std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
            std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;
        }
        // ...
        if (opt & RUN_ON_DEVICE)
        {
            v_prp.template merge_prp_v_device<op,prop,Memory,
                // ... (remaining template arguments and call elided)
        }
        else
        {
            v_prp.template merge_prp_v<op,prop,Memory,
                // ... (remaining template arguments and call elided)
        }
    }
    // ...
};

#endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */
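// --- Illustrative sketch (not part of this header) --------------------------
// ghost_put_ folds the values accumulated on ghost slots back onto the owning
// real particles through a reduction operation (the op template parameter of
// merge_prp_v above). A minimal sketch of that scatter-reduce with an "add"
// operation and hypothetical data:
#if 0
#include <cstdio>
#include <vector>

int main()
{
    std::vector<double> prp{1.0, 2.0, 3.0,   // real particles [0, g_m)
                            0.5, 0.25};      // ghost slots    [g_m, end)
    size_t g_m = 3;
    std::vector<size_t> ghost_src{0, 2};     // ghost i replicates real ghost_src[i]

    for (size_t i = g_m; i < prp.size(); i++)
        prp[ghost_src[i - g_m]] += prp[i];   // reduction op: add

    for (size_t i = 0; i < g_m; i++) printf("%.2f ", prp[i]); // 1.50 2.00 3.25
    printf("\n");
}
#endif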