8 #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_ 9 #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_ 13 #if defined(CUDA_GPU) && defined(__NVCC__) 14 #include "Vector/cuda/vector_dist_cuda_funcs.cuh" 15 #include "util/cuda/kernels.cuh" 18 #include "Vector/util/vector_dist_funcs.hpp" 19 #include "cuda/vector_dist_comm_util_funcs.cuh" 20 #include "util/cuda/scan_ofp.cuh" 25 static float ret(T & tmp)
34 static float ret(
float & tmp)
44 inline static size_t compute_options(
size_t opt)
47 if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING)
48 {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}
50 if (opt & RUN_ON_DEVICE)
52 #if defined(CUDA_GPU) && defined(__NVCC__) 54 opt_ |= MPI_GPU_DIRECT;
56 std::cout << __FILE__ <<
":" << __LINE__ <<
" error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
69 template<
unsigned int impl,
template<
typename>
class layout_base,
unsigned int ... prp>
72 template<
typename Vcluster_type,
typename vector_prop_type,
73 typename vector_pos_type,
typename send_vector,
74 typename prc_recv_get_type,
typename prc_g_opart_type,
75 typename recv_sz_get_type,
typename recv_sz_get_byte_type,
76 typename g_opart_sz_type>
77 static inline void sendrecv_prp(Vcluster_type & v_cl,
79 vector_prop_type & v_prp,
80 vector_pos_type & v_pos,
81 prc_g_opart_type & prc_g_opart,
82 prc_recv_get_type & prc_recv_get,
83 recv_sz_get_type & recv_sz_get,
84 recv_sz_get_byte_type & recv_sz_get_byte,
85 g_opart_sz_type & g_opart_sz,
92 if (
sizeof...(prp) != 0)
94 size_t opt_ = compute_options(opt);
95 if (opt & SKIP_LABELLING)
97 if (opt & RUN_ON_DEVICE)
100 v_cl.template SSendRecvP_op<
op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
105 v_cl.template SSendRecvP_op<
op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
109 {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
112 g_opart_sz.resize(prc_g_opart.size());
114 for (
size_t i = 0 ; i < prc_g_opart.size() ; i++)
115 g_opart_sz.get(i) = g_send_prp.get(i).
size();
119 template<
typename Vcluster_type,
typename vector_prop_type,
120 typename vector_pos_type,
typename send_pos_vector,
121 typename prc_recv_get_type,
typename prc_g_opart_type,
122 typename recv_sz_get_type>
123 static inline void sendrecv_pos(Vcluster_type & v_cl,
125 vector_prop_type & v_prp,
126 vector_pos_type & v_pos,
127 prc_recv_get_type & prc_recv_get,
128 recv_sz_get_type & recv_sz_get,
129 prc_g_opart_type & prc_g_opart,
132 size_t opt_ = compute_options(opt);
133 if (opt & SKIP_LABELLING)
135 v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
139 prc_recv_get.clear();
141 v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
145 template<
typename Vcluster_type,
typename vector_prop_type,
146 typename vector_pos_type,
typename send_pos_vector,
147 typename prc_recv_get_type,
typename prc_g_opart_type,
148 typename recv_sz_get_type>
149 static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
151 vector_prop_type & v_prp,
152 vector_pos_type & v_pos,
153 prc_recv_get_type & prc_recv_get,
154 recv_sz_get_type & recv_sz_get,
155 prc_g_opart_type & prc_g_opart,
159 template<
typename Vcluster_type,
typename vector_prop_type,
160 typename vector_pos_type,
typename send_vector,
161 typename prc_recv_get_type,
typename prc_g_opart_type,
162 typename recv_sz_get_type,
typename recv_sz_get_byte_type,
163 typename g_opart_sz_type>
164 static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
166 vector_prop_type & v_prp,
167 vector_pos_type & v_pos,
168 prc_g_opart_type & prc_g_opart,
169 prc_recv_get_type & prc_recv_get,
170 recv_sz_get_type & recv_sz_get,
171 recv_sz_get_byte_type & recv_sz_get_byte,
172 g_opart_sz_type & g_opart_sz,
179 template<
template<
typename>
class layout_base,
unsigned int ... prp>
182 template<
typename Vcluster_type,
typename vector_prop_type,
183 typename vector_pos_type,
typename send_vector,
184 typename prc_recv_get_type,
typename prc_g_opart_type,
185 typename recv_sz_get_type,
typename recv_sz_get_byte_type,
186 typename g_opart_sz_type>
187 static inline void sendrecv_prp(Vcluster_type & v_cl,
189 vector_prop_type & v_prp,
190 vector_pos_type & v_pos,
191 prc_g_opart_type & prc_g_opart,
192 prc_recv_get_type & prc_recv_get,
193 recv_sz_get_type & recv_sz_get,
194 recv_sz_get_byte_type & recv_sz_get_byte,
195 g_opart_sz_type & g_opart_sz,
199 prc_recv_get.clear();
205 if (
sizeof...(prp) != 0)
207 size_t opt_ = compute_options(opt);
208 if (opt & SKIP_LABELLING)
210 if (opt & RUN_ON_DEVICE)
213 v_cl.template SSendRecvP_opAsync<
op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
218 v_cl.template SSendRecvP_opAsync<
op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
222 {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
226 g_opart_sz.resize(prc_g_opart.size());
228 for (
size_t i = 0 ; i < prc_g_opart.size() ; i++)
229 {g_opart_sz.get(i) = g_send_prp.get(i).
size();}
232 template<
typename Vcluster_type,
typename vector_prop_type,
233 typename vector_pos_type,
typename send_pos_vector,
234 typename prc_recv_get_type,
typename prc_g_opart_type,
235 typename recv_sz_get_type>
236 static inline void sendrecv_pos(Vcluster_type & v_cl,
238 vector_prop_type & v_prp,
239 vector_pos_type & v_pos,
240 prc_recv_get_type & prc_recv_get,
241 recv_sz_get_type & recv_sz_get,
242 prc_g_opart_type & prc_g_opart,
245 prc_recv_get.clear();
248 size_t opt_ = compute_options(opt);
249 if (opt & SKIP_LABELLING)
251 v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
255 prc_recv_get.clear();
257 v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
261 template<
typename Vcluster_type,
typename vector_prop_type,
262 typename vector_pos_type,
typename send_pos_vector,
263 typename prc_recv_get_type,
typename prc_g_opart_type,
264 typename recv_sz_get_type>
265 static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
267 vector_prop_type & v_prp,
268 vector_pos_type & v_pos,
269 prc_recv_get_type & prc_recv_get,
270 recv_sz_get_type & recv_sz_get,
271 prc_g_opart_type & prc_g_opart,
274 size_t opt_ = compute_options(opt);
275 if (opt & SKIP_LABELLING)
277 v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
281 v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
285 template<
typename Vcluster_type,
typename vector_prop_type,
286 typename vector_pos_type,
typename send_vector,
287 typename prc_recv_get_type,
typename prc_g_opart_type,
288 typename recv_sz_get_type,
typename recv_sz_get_byte_type,
289 typename g_opart_sz_type>
290 static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
292 vector_prop_type & v_prp,
293 vector_pos_type & v_pos,
294 prc_g_opart_type & prc_g_opart,
295 prc_recv_get_type & prc_recv_get,
296 recv_sz_get_type & recv_sz_get,
297 recv_sz_get_byte_type & recv_sz_get_byte,
298 g_opart_sz_type & g_opart_sz,
305 if (
sizeof...(prp) != 0)
307 size_t opt_ = compute_options(opt);
308 if (opt & SKIP_LABELLING)
310 if (opt & RUN_ON_DEVICE)
313 v_cl.template SSendRecvP_opWait<
op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
318 v_cl.template SSendRecvP_opWait<
op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
322 {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
340 template<
unsigned int dim,
447 template<
typename prp_object,
int ... prp>
451 template<
typename T1,
typename T2>
inline static void proc(
size_t lbl,
size_t cnt,
size_t id, T1 & v_prp, T2 & m_prp)
459 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(
id), m_prp.get(lbl).get(cnt));
473 if (recv_sz_get_prp.
size() != 0)
474 {
return recv_sz_get_prp.get(i);}
485 if (prc_recv_get_prp.
size() != 0)
486 {
return prc_recv_get_prp.
size();}
497 if (prc_recv_get_prp.
size() != 0)
498 {
return prc_recv_get_prp;}
514 if (opt & RUN_ON_DEVICE)
520 if (
prc_sz.template get<1>(i) != (
unsigned int)-1)
522 prc_r.add(
prc_sz.template get<1>(i));
523 prc_sz_r.add(
prc_sz.template get<0>(i) - prev_off);
525 prev_off =
prc_sz.template get<0>(i);
537 prc_sz_r.add(
prc_sz.template get<0>(i));
551 if (
prc_sz.template get<0>(i) != 0)
555 prc_sz_r.add(
prc_sz.template get<0>(i));
597 unsigned int box_f_sv;
600 bool operator<(
const sh_box & tmp)
const 602 return shift_id < tmp.shift_id;
613 for (
size_t i = 0; i <
dec.getNLocalSub(); i++)
615 size_t Nl =
dec.getLocalNIGhost(i);
617 for (
size_t j = 0; j < Nl; j++)
621 if (
dec.getLocalIGhostPos(i, j).n_zero() == dim)
625 auto it =
map_cmb.find(
dec.getLocalIGhostPos(i, j).lin());
630 box_f.last().add(
dec.getLocalIGhostBox(i, j));
637 box_f.get(it->second).add(
dec.getLocalIGhostBox(i, j));
641 reord_shift.last().shift_id =
dec.getLocalIGhostPos(i, j).lin();
642 reord_shift.last().box_f_dev =
dec.getLocalIGhostBox(i, j);
643 reord_shift.last().box_f_sv =
dec.convertShift(
dec.getLocalIGhostPos(i, j));
651 box_f_sv.resize(reord_shift.
size());
653 for (
size_t i = 0 ; i < reord_shift.
size() ; i++)
655 box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
656 box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
663 box_f_sv.template hostToDevice<0>();
684 if (!(opt & NO_POSITION))
686 if (opt & RUN_ON_DEVICE)
695 size_t lin_id =
o_part_loc.template get<1>(i);
700 p -= shifts.get(lin_id);
704 v_prp.get(
lg_m+i) = v_prp.get(key);
710 if (opt & RUN_ON_DEVICE)
721 v_prp.get(
lg_m+i) = v_prp.get(key);
736 size_t g_m,
size_t opt)
743 if (opt & RUN_ON_DEVICE)
751 auto it = v_pos.getIteratorTo(g_m);
758 for (
size_t i = 0; i <
box_f.size(); i++)
760 for (
size_t j = 0; j <
box_f.get(i).size(); j++)
762 if (
box_f.get(i).get(j).isInsideNP(v_pos.get(key)) ==
true)
764 size_t lin_id =
dec.convertShift(
box_cmb.get(i));
772 p -= shifts.get(lin_id);
777 v_prp.last() = v_prp.get(key);
853 if (!(opt & SKIP_LABELLING))
856 if (
box_f.size() == 0)
860 if (opt & SKIP_LABELLING)
885 size_t old_hsmem_size = 0;
896 for (
size_t i = 0; i < g_pos_send.
size(); i++)
900 if (
hsmem.
get(i+old_hsmem_size).ref() == 0)
901 {
hsmem.
get(i+old_hsmem_size).incRef();}
904 g_pos_send.get(i).setMemory(
hsmem.
get(i+old_hsmem_size));
907 g_pos_send.get(i).resize(
prc_sz.get(i));
910 if (opt & RUN_ON_DEVICE)
912 #if defined(CUDA_GPU) && defined(__NVCC__) 917 for (
size_t i = 0 ; i < g_pos_send.
size() ; i++)
919 auto ite = g_pos_send.get(i).getGPUIterator();
921 CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(
g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
924 v_pos.toKernel(),shifts.toKernel(),offset);
931 std::cout << __FILE__ <<
":" << __LINE__ <<
" error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
943 s -= shifts.get(
g_opart.get(i).template get<1>(j));
944 g_pos_send.get(i).set(j, s);
961 template<
typename send_vector,
typename prp_object,
int ... prp>
974 g_send_prp.resize(nproc);
978 for (
size_t i = 0; i < g_send_prp.
size(); i++)
986 g_send_prp.get(i).setMemory(
hsmem.
get(i));
991 g_send_prp.get(i).resize(n_part_recv);
996 if (opt & RUN_ON_DEVICE)
998 #if defined(CUDA_GPU) && defined(__NVCC__) 1000 if (
sizeof...(prp) != 0)
1003 for (
size_t i = 0 ; i < g_send_prp.
size() ; i++)
1007 auto ite = g_send_prp.get(i).getGPUIterator();
1009 if (ite.nblocks() == 0) {
continue;}
1011 CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1013 g_send_prp.get(i).toKernel(),
1014 v_prp.toKernel(),accum);
1016 accum = accum + n_part_recv;
1022 std::cout << __FILE__ <<
":" << __LINE__ <<
" error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1029 for (
size_t i = 0; i < g_send_prp.
size(); i++)
1034 for (
size_t j = accum; j < accum + n_part_recv; j++)
1042 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
1047 accum = accum + n_part_recv;
1059 for (
size_t i = nbf ; i < rt_buf.
size() ; i++)
1061 rt_buf.
get(i).decRef();
1071 template<
typename send_vector,
typename v_mpl>
1084 :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
1088 template<
typename T>
1091 g_send_prp.get(i).template setMemory<T::value>(hsmem.
get(j));
1097 template<
bool inte_or_lin,
typename send_vector,
typename v_mpl>
1107 g_send_prp.get(i).setMemory(
hsmem.
get(j));
1110 g_send_prp.get(i).resize(
prc_sz.get(i));
1116 template<
typename send_vector,
typename v_mpl>
1127 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);
1130 if (boost::mpl::size<v_mpl>::type::value != 0)
1133 g_send_prp.get(i).resize(
prc_sz.get(i));
1150 template<
typename send_vector,
typename prp_object,
int ... prp>
1160 if (
is_layout_inte<layout_base<prop>>::value ==
true) {factor *=
sizeof...(prp);}
1167 for (
size_t i = 0; i <
hsmem.
size(); i++)
1176 for (
size_t i = 0; i < g_send_prp.
size(); i++)
1178 j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,
prc_sz,i,
hsmem,j);
1181 if (opt & RUN_ON_DEVICE)
1183 #if defined(CUDA_GPU) && defined(__NVCC__) 1187 if (
sizeof...(prp) != 0)
1190 for (
size_t i = 0 ; i < g_send_prp.
size() ; i++)
1192 auto ite = g_send_prp.get(i).getGPUIterator();
1194 CUDA_LAUNCH((process_ghost_particles_prp<decltype(
g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1197 v_prp.toKernel(),offset);
1205 std::cout << __FILE__ <<
":" << __LINE__ <<
" error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1212 if (
sizeof...(prp) == 0) {
return;}
1217 for (
size_t j = 0; j <
g_opart.get(i).
size(); j++)
1220 typedef decltype(v_prp.get(
g_opart.get(i).template get<0>(j))) encap_src;
1222 typedef decltype(g_send_prp.get(i).get(j)) encap_dst;
1225 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(
g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
1251 m_prp.resize(prc_sz_r.
size());
1252 m_pos.resize(prc_sz_r.
size());
1255 for (
size_t i = 0; i < prc_sz_r.
size() ; i++)
1258 m_pos.get(i).resize(prc_sz_r.get(i));
1259 m_prp.get(i).resize(prc_sz_r.get(i));
1263 if (opt & RUN_ON_DEVICE)
1268 #if defined(CUDA_GPU) && defined(__NVCC__) 1279 starts.template deviceToHost<0>();
1280 size_t offset =
starts.template get<0>(rank);
1283 if (ite.wthr.x != 0)
1286 CUDA_LAUNCH((process_map_particles<decltype(
m_opart.toKernel()),decltype(
v_pos_tmp.toKernel()),decltype(
v_prp_tmp.toKernel()),
1287 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1290 v_pos.toKernel(),v_prp.toKernel(),offset);
1294 for (
size_t i = 0 ; i < m_pos.size() ; i++)
1296 size_t offset =
starts.template get<0>(prc_r.template get<0>(i));
1298 auto ite = m_pos.get(i).getGPUIterator();
1301 if (ite.wthr.x != 0)
1304 CUDA_LAUNCH((process_map_particles<decltype(
m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
1305 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1307 m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
1308 v_pos.toKernel(),v_prp.toKernel(),offset);
1319 std::cout << __FILE__ <<
":" << __LINE__ <<
" error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1326 long int id_end = v_pos.
size();
1334 process_map_particle<proc_without_prp>(i,end,id_end,
m_opart,
p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1355 template<
typename prp_object,
int ... prp>
1362 m_prp.resize(prc_sz_r.
size());
1363 m_pos.resize(prc_sz_r.
size());
1366 for (
size_t i = 0; i < prc_sz_r.
size(); i++)
1369 m_pos.get(i).resize(prc_sz_r.get(i));
1370 m_prp.get(i).resize(prc_sz_r.get(i));
1375 long int id_end = v_pos.size();
1383 process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,
m_opart,
p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1401 layout_base> & lbl_p,
1405 if (opt == RUN_ON_DEVICE)
1411 lbl_p.resize(v_pos.size());
1415 prc_sz.template fill<0>(0);
1417 auto ite = v_pos.getGPUIterator();
1418 if (ite.wthr.x == 0)
1421 starts.template fill<0>(0);
1432 for (
size_t i = 0 ; i < dim ; i++) {bc.bc[i] =
dec.periodicity(i);}
1434 CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,
dec.getDomain(),bc,v_pos.toKernel());
1440 CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(
dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(
prc_sz.toKernel())>),
1442 dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),
prc_sz.toKernel(),
v_cl.
rank());
1448 prc_sz.template deviceToHost<0>();
1450 ite = lbl_p.getGPUIterator();
1453 CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(
starts.toKernel())>),ite,lbl_p.toKernel(),
starts.toKernel());
1458 std::cout << __FILE__ <<
":" << __LINE__ <<
" error, it seems you tried to call map with RUN_ON_DEVICE option, this requires to compile the program with NVCC" << std::endl;
1472 prc_sz.template fill<0>(0);
1474 auto it = v_pos.getIterator();
1479 auto key = it.get();
1482 dec.applyPointBC(v_pos.get(key));
1487 if (
dec.getDomain().isInside(v_pos.get(key)) ==
true)
1488 {p_id =
dec.processorID(v_pos.get(key));}
1495 if ((
long int) p_id != -1)
1497 prc_sz.template get<0>(p_id)++;
1499 lbl_p.last().template get<0>() = key;
1500 lbl_p.last().template get<2>() = p_id;
1505 lbl_p.last().template get<0>() = key;
1506 lbl_p.last().template get<2>() = p_id;
1544 if (opt & RUN_ON_DEVICE)
1548 ::run(
mem,
dec,
g_opart_device,
proc_id_out,
starts,
v_cl,v_pos,v_prp,prc,
prc_sz,
prc_offset,g_m,opt);
1553 auto it = v_pos.getIteratorTo(g_m);
1556 auto key = it.get();
1562 for (
size_t i = 0; i < vp_id.
size(); i++)
1565 size_t p_id = vp_id.get(i).first;
1569 g_opart.get(p_id).last().template get<0>() = key;
1570 g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
1586 g_opart.get(i).swap(g_opart_f.last());
1587 prc.add(
dec.IDtoProc(i));
1593 #ifdef EXTREA_TRACE_PRE_COMM 1594 Extrae_user_function (0);
1611 static void *
message_alloc_map(
size_t msg_i,
size_t total_msg,
size_t total_p,
size_t i,
size_t ri,
void * ptr)
1614 vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd =
static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *
>(ptr);
1617 vd->recv_mem_gm.get(i).resize(msg_i);
1619 return vd->recv_mem_gm.get(i).getPointer();
1630 :
v_cl(create_vcluster<Memory>()),
dec(create_vcluster()),
lg_m(0)
1662 :
v_cl(create_vcluster<Memory>()),
dec(create_vcluster()),
lg_m(0)
1673 for (
size_t i = 0 ; i <
hsmem.
size() ; i++)
1678 std::cout << __FILE__ <<
":" << __LINE__ <<
" internal error memory is in an invalid state " << std::endl;
1700 this->v_sub_unit_factor = n_sub;
1712 const size_t (& bc)[dim],
1719 if (opt & BIND_DEC_TO_GHOST)
1725 CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
1728 cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);
1730 for (
size_t i = 0 ; i < dim ; i++)
1731 {div[i] = cd_sm.getDiv()[i] - 2*pad;}
1734 dec.setParameters(div, box, bc, g, gdist);
1752 const size_t (& bc)[dim],
1759 for (
size_t i = 0 ; i < dim ; i++)
1760 {div[i] = gdist.
size(i);}
1763 dec.setParameters(div, box, bc, g);
1781 size_t opt = WITH_POSITION)
1783 #ifdef PROFILE_SCOREP 1784 SCOREP_USER_REGION(
"ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
1788 typedef object<
typename object_creator<
typename prop::type, prp...>::type> prp_object;
1793 if (!(opt & NO_POSITION))
1794 {v_pos.resize(g_m);}
1798 if (!(opt & SKIP_LABELLING))
1799 {v_prp.resize(g_m);}
1802 if ((opt & SKIP_LABELLING) ==
false)
1811 #if defined(CUDA_GPU) && defined(__NVCC__) 1812 cudaDeviceSynchronize();
1823 if (!(opt & NO_POSITION))
1830 #if defined(CUDA_GPU) && defined(__NVCC__) 1831 cudaDeviceSynchronize();
1847 if (!(opt & SKIP_LABELLING))
1849 v_prp.resize(v_pos.size());
1868 size_t opt = WITH_POSITION)
1871 typedef object<
typename object_creator<
typename prop::type, prp...>::type> prp_object;
1907 if (opt & RUN_ON_DEVICE)
1909 std::cout <<
"Error: " << __FILE__ <<
":" << __LINE__ <<
" map_list is unsupported on device (coming soon)" << std::endl;
1933 if (
prc_sz.template get<0>(i) != 0)
1937 prc_sz_r.add(
prc_sz.template get<0>(i));
1941 if (opt & MAP_LOCAL)
1946 for (
size_t i = 0 ; i <
dec.getNNProcessors() ; i++)
1951 typedef object<
typename object_creator<
typename prop::type, prp...>::type> prp_object;
1961 v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,
prc_recv_map,
recv_sz_map,opt);
1981 template<
typename obp = KillParticle>
1986 #ifdef PROFILE_SCOREP 1987 SCOREP_USER_REGION(
"map",SCOREP_USER_REGION_TYPE_FUNCTION)
2014 if (opt & RUN_ON_DEVICE)
2016 #if defined(CUDA_GPU) && defined(__NVCC__) 2018 cudaDeviceSynchronize();
2019 opt_ |= MPI_GPU_DIRECT;
2021 std::cout << __FILE__ <<
":" << __LINE__ <<
" error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2030 v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
2067 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
operator=(
const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
2081 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> &
operator=(
vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
2099 template<
template<
typename,
typename>
class op,
int ... prp>
2106 typedef object<
typename object_creator<
typename prop::type, prp...>::type> prp_object;
2114 if (opt & RUN_ON_DEVICE)
2116 #if defined(CUDA_GPU) && defined(__NVCC__) 2118 cudaDeviceSynchronize();
2120 std::cout << __FILE__ <<
":" << __LINE__ <<
" error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2125 if (opt & NO_CHANGE_ELEMENTS)
2127 size_t opt_ = compute_options(opt);
2129 if (opt & RUN_ON_DEVICE)
2150 size_t opt_ = compute_options(opt);
2152 if (opt & RUN_ON_DEVICE)
2176 std::cerr <<
"Error: " << __FILE__ <<
":" << __LINE__ <<
" Local ghost particles = " << v_prp.
size() -
lg_m <<
" != " <<
o_part_loc.
size() << std::endl;
2177 std::cerr <<
"Error: " << __FILE__ <<
":" << __LINE__ <<
" Check that you did a ghost_get before a ghost_put" << std::endl;
2181 if (opt & RUN_ON_DEVICE)
2183 v_prp.template merge_prp_v_device<op,prop,Memory,
2190 v_prp.template merge_prp_v<op,prop,Memory,
openfpm::vector_std< openfpm::vector_std< Box< dim, St > > > box_f
std::unordered_map< size_t, size_t > map_cmb
this map is used to check if a combination is already present
openfpm::vector< size_t > recv_sz_get_pos
process the particle with properties
static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void *ptr)
Call-back to allocate buffer to receive incoming elements (particles)
openfpm::vector< Point< dim, St >, Memory, layout_base > v_pos_tmp
Helper buffer for computation (on GPU) of local particles (position)
openfpm::vector_std< comb< dim > > box_cmb
Store the sector for each group (previous vector)
openfpm::vector< openfpm::vector< aggregate< size_t, size_t > > > g_opart
Vcluster< Memory > & v_cl
VCluster.
Transform the boost::fusion::vector into memory specification (memory_traits)
openfpm::vector< size_t > prc_sz_gg
elements sent for each processor (ghost_get)
size_t getProcessUnitID()
Get the process unit id.
openfpm::vector_fr< Memory > hsmem
Sending buffer.
openfpm::vector< size_t > prc_recv_map
the same as prc_recv_get but for map
size_t v_sub_unit_factor
Number of units for each sub-domain.
Helper class to merge data.
Grow policy define how the vector should grow every time we exceed the size.
openfpm::vector< size_t > p_map_req
It map the processor id with the communication request into map procedure.
size_t size()
return the size of the vector
void map_(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t &g_m, size_t opt)
It moves all the particles that do not belong to the local processor to the respective processor.
__device__ __host__ size_t size() const
Return the size of the grid.
openfpm::vector< aggregate< unsigned int, unsigned long int >, CudaMemory, memory_traits_inte > g_opart_device
Same as g_opart but on device; the vector of vectors is flattened into a single vector.
openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > prc_sz
Processor communication size.
openfpm::vector< prop, Memory, layout_base > v_prp_tmp
Helper buffer for computation (on GPU) of local particles (properties)
openfpm::vector< size_t > recv_sz_put
The same as recv_sz_get but for put.
void ghost_get_(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t &g_m, size_t opt=WITH_POSITION)
It synchronizes the properties and positions of the ghost particles.
void fill_send_map_buf(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, openfpm::vector< size_t > &prc_sz_r, openfpm::vector< size_t > &prc_r, openfpm::vector< openfpm::vector< Point< dim, St >, Memory, layout_base, openfpm::grow_policy_identity >> &m_pos, openfpm::vector< openfpm::vector< prop, Memory, layout_base, openfpm::grow_policy_identity >> &m_prp, openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > &prc_sz, size_t opt)
allocate and fill the send buffer for the map function
void map_list_(openfpm::vector< Point< dim, St >> &v_pos, openfpm::vector< prop > &v_prp, size_t &g_m, size_t opt)
It moves all the particles that do not belong to the local processor to the respective processor.
void fill_send_ghost_prp_buf(openfpm::vector< prop, Memory, layout_base > &v_prp, openfpm::vector< size_t > &prc_sz, openfpm::vector< send_vector > &g_send_prp, size_t opt)
This function fills the send buffer for properties after the particles have been labeled with labelParticlesGhost().
Helper class to merge data.
void add_loc_particles_bc(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t &g_m, size_t opt)
Add local particles based on the boundary conditions.
This class implement the point shape in an N-dimensional space.
void setDecompositionGranularity(size_t n_sub)
Set the minimum number of sub-domain per processor.
template selector for asynchronous or not asynchronous
openfpm::vector< size_t > prc_recv_get_pos
Decomposition & getDecomposition()
Get the decomposition.
openfpm::vector< size_t > recv_sz_map
The same as recv_sz_get but for map.
openfpm::vector< size_t > & get_last_ghost_get_num_proc_vector()
Get the number of processors involved during the last ghost_get.
This class allocate, and destroy CPU memory.
void fill_send_map_buf_list(openfpm::vector< Point< dim, St >> &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, openfpm::vector< size_t > &prc_sz_r, openfpm::vector< openfpm::vector< Point< dim, St >>> &m_pos, openfpm::vector< openfpm::vector< prp_object >> &m_prp)
allocate and fill the send buffer for the map function
Transform the boost::fusion::vector into memory specification (memory_traits)
void resize(size_t sz)
resize the vector retaining the objects
This class define the domain decomposition interface.
openfpm::vector< size_t > prc_recv_put
the same as prc_recv_get but for put
void calc_send_buffers(openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > &prc_sz, openfpm::vector< size_t > &prc_sz_r, openfpm::vector< size_t > &prc_r, size_t opt)
Calculate sending buffer size for each processor.
long int shift_box_ndec
From which decomposition the shift boxes are calculated.
vector_dist_comm< dim, St, prop, Decomposition, Memory, layout_base > & operator=(vector_dist_comm< dim, St, prop, Decomposition, Memory, layout_base > &&vc)
Move a vector.
void local_ghost_from_dec(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t g_m, size_t opt)
Local ghost from decomposition.
Grow policy define how the vector should grow every time we exceed the size.
openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > o_part_loc
Id of the local particle to replicate for ghost_get.
void init_decomposition(Box< dim, St > &box, const size_t(&bc)[dim], const Ghost< dim, St > &g, size_t opt, const grid_sm< dim, void > &gdist)
Initialize the decomposition.
This class decompose a space into sub-sub-domains and distribute them across processors.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
size_t rank()
Get the process unit id.
Helper class to merge data.
size_t getDecompositionGranularity()
Get the number of minimum sub-domain per processor.
void operator()(T &t)
It call the setMemory function for each property.
openfpm::vector< size_t > recv_sz_get_byte
Conversion to byte of recv_sz_get.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
size_t getProcessingUnits()
Get the total number of processors.
This class is an helper for the communication of vector_dist.
void init_decomposition_gr_cell(Box< dim, St > &box, const size_t(&bc)[dim], const Ghost< dim, St > &g, size_t opt, const grid_sm< dim, void > &gdist)
Initialize the decomposition.
vector_dist_comm< dim, St, prop, Decomposition, Memory, layout_base > & operator=(const vector_dist_comm< dim, St, prop, Decomposition, Memory, layout_base > &vc)
Copy a vector.
Out-of-bound policy: it detects out-of-bound particles and decides what to do with them.
openfpm::vector< Box< dim, St >, Memory, layout_base > box_f_dev
The boxes touching the border of the domain + shift vector linearized from where they come from.
mgpu::ofp_context_t & getmgpuContext(bool iw=true)
If NVIDIA CUDA is activated, it returns an mgpu context.
vector_dist_comm(Decomposition &&dec)
Constructor.
void fill_send_ghost_put_prp_buf(openfpm::vector< prop, Memory, layout_base > &v_prp, openfpm::vector< send_vector > &g_send_prp, size_t &g_m, size_t opt)
This function fill the send buffer for ghost_put.
vector_dist_comm(const vector_dist_comm< dim, St, prop, Decomposition, Memory, layout_base > &v)
Copy Constructor.
void local_ghost_from_opart(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t opt)
Local ghost from labeled particles.
openfpm::vector< aggregate< int, int, int >, Memory, layout_base > m_opart
~vector_dist_comm()
Destructor.
void ghost_put_(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t &g_m, size_t opt)
Ghost put.
openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > prc_offset
Processor communication size.
openfpm::vector< Point< dim, St >, Memory, layout_base, openfpm::grow_policy_identity > send_pos_vector
definition of the send vector for position
Decomposition dec
Domain decomposition.
openfpm::vector< size_t > g_opart_sz
Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size()
void labelParticleProcessor(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< aggregate< int, int, int >, Memory, layout_base > &lbl_p, openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > &prc_sz, size_t opt)
Label particles for mappings.
void ghost_wait_(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, size_t &g_m, size_t opt=WITH_POSITION)
It synchronizes the properties and positions of the ghost particles.
static void proc(size_t lbl, size_t cnt, size_t id, T1 &v_prp, T2 &m_prp)
process the particle
Helper class to merge data.
It creates a boost::fusion vector with the selected properties.
void resize_retained_buffer(openfpm::vector_fr< Memory > &rt_buf, size_t nbf)
resize the retained buffer by nbf
size_t size()
Get the total number of processors.
const Decomposition & getDecomposition() const
Get the decomposition.
void fill_send_ghost_pos_buf(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< size_t > &prc_sz, openfpm::vector< send_pos_vector > &g_pos_send, size_t opt, bool async)
This function fills the send buffer for the particle positions after the particles have been labeled with labelParticlesGhost().
Aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native structure.
openfpm::vector< aggregate< unsigned int >, Memory, layout_base > starts
temporary buffer for the scan result
vector_dist_comm()
Constructor.
Set the buffer for each property.
void createShiftBox()
For every internal ghost box we create a structure that orders such internal local ghost boxes in shift ...
vector_dist_comm(const Decomposition &dec)
Constructor.
CudaMemory mem
Temporary CudaMemory to do stuff.
bool SSendRecv(openfpm::vector< T > &send, S &recv, openfpm::vector< size_t > &prc_send, openfpm::vector< size_t > &prc_recv, openfpm::vector< size_t > &sz_recv, size_t opt=NONE)
Semantic Send and receive, send the data to processors and receive from the other processors.
openfpm::vector< aggregate< unsigned int >, Memory, layout_base > proc_id_out
temporary buffer to processors ids
size_t get_last_ghost_get_num_proc()
Get the number of processors involved during the last ghost_get.
T & get(size_t id)
Get an element of the vector.
openfpm::vector< size_t > prc_g_opart
processor rank list of g_opart
It copies the properties from one object to another.
void labelParticlesGhost(openfpm::vector< Point< dim, St >, Memory, layout_base > &v_pos, openfpm::vector< prop, Memory, layout_base > &v_prp, openfpm::vector< size_t > &prc, openfpm::vector< size_t > &prc_sz, openfpm::vector< aggregate< unsigned int, unsigned int >, Memory, layout_base > &prc_offset, size_t &g_m, size_t opt)
Label the particles.
size_t get_last_ghost_get_received_parts(size_t i)
Get the number of particles received from each processor during the last ghost_get.