#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_

#if defined(CUDA_GPU) && defined(__NVCC__)
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "util/cuda/kernels.cuh"
#endif

#include "Vector/util/vector_dist_funcs.hpp"
#include "cuda/vector_dist_comm_util_funcs.cuh"
#include "util/cuda/scan_ofp.cuh"
static float ret(T & tmp)

static float ret(float & tmp)
inline static size_t compute_options(size_t opt)

if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING)
{opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}

if (opt & RUN_ON_DEVICE)
#if defined(CUDA_GPU) && defined(__NVCC__)
opt_ |= MPI_GPU_DIRECT;
#else
std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type, typename recv_sz_get_byte_type,
         typename g_opart_sz_type>
static inline void sendrecv_prp(Vcluster_type & v_cl,
                                vector_prop_type & v_prp,
                                vector_pos_type & v_pos,
                                prc_g_opart_type & prc_g_opart,
                                prc_recv_get_type & prc_recv_get,
                                recv_sz_get_type & recv_sz_get,
                                recv_sz_get_byte_type & recv_sz_get_byte,
                                g_opart_sz_type & g_opart_sz,
if (sizeof...(prp) != 0)
{
    size_t opt_ = compute_options(opt);

    if (opt & SKIP_LABELLING)
    {
        if (opt & RUN_ON_DEVICE)
        {
            v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
    }
    else
    {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
}

g_opart_sz.resize(prc_g_opart.size());

for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
    g_opart_sz.get(i) = g_send_prp.get(i).size();
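// With SKIP_LABELLING the received properties are merged into the ghost slots
// created by the previous ghost_get (the op_ssend_gg_recv_merge* operations)
// instead of being appended; the run_device variant performs the same merge
// with the buffers resident on the GPU.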
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_pos_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type>
static inline void sendrecv_pos(Vcluster_type & v_cl,
                                vector_prop_type & v_prp,
                                vector_pos_type & v_pos,
                                prc_recv_get_type & prc_recv_get,
                                recv_sz_get_type & recv_sz_get,
                                prc_g_opart_type & prc_g_opart,
size_t opt_ = compute_options(opt);

if (opt & SKIP_LABELLING)
{
    v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
else
{
    prc_recv_get.clear();

    v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
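// Without SKIP_LABELLING the receive metadata (prc_recv_get / recv_sz_get)
// from the previous exchange is stale, so it is cleared and filled again by
// SSendRecv; with SKIP_LABELLING it is reused, turning the communication into
// a known-size receive.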
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_pos_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type>
static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
                                     vector_prop_type & v_prp,
                                     vector_pos_type & v_pos,
                                     prc_recv_get_type & prc_recv_get,
                                     recv_sz_get_type & recv_sz_get,
                                     prc_g_opart_type & prc_g_opart,
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type, typename recv_sz_get_byte_type,
         typename g_opart_sz_type>
static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
                                     vector_prop_type & v_prp,
                                     vector_pos_type & v_pos,
                                     prc_g_opart_type & prc_g_opart,
                                     prc_recv_get_type & prc_recv_get,
                                     recv_sz_get_type & recv_sz_get,
                                     recv_sz_get_byte_type & recv_sz_get_byte,
                                     g_opart_sz_type & g_opart_sz,
template<template<typename> class layout_base, unsigned int ... prp>
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type, typename recv_sz_get_byte_type,
         typename g_opart_sz_type>
static inline void sendrecv_prp(Vcluster_type & v_cl,
                                vector_prop_type & v_prp,
                                vector_pos_type & v_pos,
                                prc_g_opart_type & prc_g_opart,
                                prc_recv_get_type & prc_recv_get,
                                recv_sz_get_type & recv_sz_get,
                                recv_sz_get_byte_type & recv_sz_get_byte,
                                g_opart_sz_type & g_opart_sz,
prc_recv_get.clear();

if (sizeof...(prp) != 0)
{
    size_t opt_ = compute_options(opt);

    if (opt & SKIP_LABELLING)
    {
        if (opt & RUN_ON_DEVICE)
        {
            v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
    }
    else
    {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
}

g_opart_sz.resize(prc_g_opart.size());

for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
{g_opart_sz.get(i) = g_send_prp.get(i).size();}
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_pos_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type>
static inline void sendrecv_pos(Vcluster_type & v_cl,
                                vector_prop_type & v_prp,
                                vector_pos_type & v_pos,
                                prc_recv_get_type & prc_recv_get,
                                recv_sz_get_type & recv_sz_get,
                                prc_g_opart_type & prc_g_opart,
prc_recv_get.clear();

size_t opt_ = compute_options(opt);

if (opt & SKIP_LABELLING)
{
    v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
else
{
    prc_recv_get.clear();

    v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_pos_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type>
static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
                                     vector_prop_type & v_prp,
                                     vector_pos_type & v_pos,
                                     prc_recv_get_type & prc_recv_get,
                                     recv_sz_get_type & recv_sz_get,
                                     prc_g_opart_type & prc_g_opart,
size_t opt_ = compute_options(opt);

if (opt & SKIP_LABELLING)
{
    v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
else
{
    v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
}
template<typename Vcluster_type, typename vector_prop_type,
         typename vector_pos_type, typename send_vector,
         typename prc_recv_get_type, typename prc_g_opart_type,
         typename recv_sz_get_type, typename recv_sz_get_byte_type,
         typename g_opart_sz_type>
static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
                                     vector_prop_type & v_prp,
                                     vector_pos_type & v_pos,
                                     prc_g_opart_type & prc_g_opart,
                                     prc_recv_get_type & prc_recv_get,
                                     recv_sz_get_type & recv_sz_get,
                                     recv_sz_get_byte_type & recv_sz_get_byte,
                                     g_opart_sz_type & g_opart_sz,
if (sizeof...(prp) != 0)
{
    size_t opt_ = compute_options(opt);

    if (opt & SKIP_LABELLING)
    {
        if (opt & RUN_ON_DEVICE)
        {
            v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
        }
    }
    else
    {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
}
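// The *Async overloads above only post the sends and receives; the matching
// *Wait overloads must be called with the same buffers and options before the
// received positions/properties are used, completing the exchange.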
template<unsigned int dim,
template<typename prp_object, int ... prp>

template<typename T1, typename T2>
inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)

object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
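// object_si_d copies only the selected properties prp... from the source
// encapsulated particle (v_prp.get(id)) into slot cnt of the send buffer for
// destination lbl, so map/ghost messages carry just the requested subset.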
if (recv_sz_get_prp.size() != 0)
{return recv_sz_get_prp.get(i);}

if (prc_recv_get_prp.size() != 0)
{return prc_recv_get_prp.size();}

if (prc_recv_get_prp.size() != 0)
{return prc_recv_get_prp;}
if (opt & RUN_ON_DEVICE)
{
    for (size_t i = 0; i < prc_sz.size() ; i++)
    {
        if (prc_sz.template get<1>(i) != (unsigned int)-1)
        {
            prc_r.add(prc_sz.template get<1>(i));
            prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
        }

        prev_off = prc_sz.template get<0>(i);
    }
}

for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
{
    if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
    {
        prc_sz_r.add(prc_sz.template get<0>(i));
    }
}

for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
{
    if (prc_sz.template get<0>(i) != 0)
    {
        prc_sz_r.add(prc_sz.template get<0>(i));
    }
}
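// calc_send_buffers compacts the per-processor counters in prc_sz into dense
// lists: prc_r holds the destination ranks and prc_sz_r the matching message
// sizes, skipping ranks that receive nothing. On the device path the counts
// arrive as a prefix scan, hence the prev_off subtraction to recover each
// rank's size.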
unsigned int box_f_sv;

bool operator<(const sh_box & tmp) const
{
    return shift_id < tmp.shift_id;
}
for (size_t i = 0; i < dec.getNLocalSub(); i++)
{
    size_t Nl = dec.getLocalNIGhost(i);

    for (size_t j = 0; j < Nl; j++)
    {
        if (dec.getLocalIGhostPos(i, j).n_zero() == dim)

        auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());

        box_f.last().add(dec.getLocalIGhostBox(i, j));

        box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));

        reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
        reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
        reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
    }
}

box_f_sv.resize(reord_shift.size());

for (size_t i = 0 ; i < reord_shift.size() ; i++)
{
    box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
    box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
}

box_f_sv.template hostToDevice<0>();
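// The same border boxes are kept twice: grouped per shift combination in
// box_f for the host path, and flattened (ordered by shift_id through
// sh_box::operator< above) into box_f_dev/box_f_sv, which are uploaded with
// hostToDevice so the GPU kernels can perform the same box tests.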
if (!(opt & NO_POSITION))
{
    if (opt & RUN_ON_DEVICE)

    for (size_t i = 0 ; i < o_part_loc.size() ; i++)
    {
        size_t lin_id = o_part_loc.template get<1>(i);

        p -= shifts.get(lin_id);

        v_prp.get(lg_m+i) = v_prp.get(key);
    }
}

if (opt & RUN_ON_DEVICE)

for (size_t i = 0 ; i < o_part_loc.size() ; i++)
{
    v_prp.get(lg_m+i) = v_prp.get(key);
}
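// local_ghost_from_opart replicates the labelled local particles
// (o_part_loc) across periodic boundaries: when positions are requested each
// replica is translated by its shift vector before being appended, while the
// properties are copied unchanged into the local ghost section starting at
// lg_m.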
size_t ghostMarker, size_t opt)

if (opt & RUN_ON_DEVICE)

auto it = v_pos.getIteratorTo(ghostMarker);

for (size_t i = 0; i < box_f.size(); i++)
{
    for (size_t j = 0; j < box_f.get(i).size(); j++)
    {
        if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
        {
            size_t lin_id = dec.convertShift(box_cmb.get(i));

            p -= shifts.get(lin_id);

            v_prp.last() = v_prp.get(key);
        }
    }
}
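// local_ghost_from_dec is the labelling-free variant: every real particle
// (the iterator stops at ghostMarker) is tested against the border boxes in
// box_f, and for each box that contains it a shifted copy of position and
// properties is appended, creating the periodic images directly from the
// decomposition.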
size_t & ghostMarker,

if (!(opt & SKIP_LABELLING))

if (box_f.size() == 0)

if (opt & SKIP_LABELLING)

g_pos_send.resize(prc_sz.size());

size_t old_hsmem_size = 0;

old_hsmem_size = hsmem.size();

for (size_t i = 0; i < g_pos_send.size(); i++)
{
    if (hsmem.get(i+old_hsmem_size).ref() == 0)
    {hsmem.get(i+old_hsmem_size).incRef();}

    g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));

    g_pos_send.get(i).resize(prc_sz.get(i));
}
if (opt & RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)

    for (size_t i = 0 ; i < g_pos_send.size() ; i++)
    {
        auto ite = g_pos_send.get(i).getGPUIterator();

        CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
            v_pos.toKernel(),shifts.toKernel(),(unsigned int)offset);
    }

#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but it seems to be compiled with a normal compiler" << std::endl;
#endif
}

for (size_t j = 0; j < g_opart.get(i).size(); j++)
{
    s -= shifts.get(g_opart.get(i).template get<1>(j));
    g_pos_send.get(i).set(j, s);
}
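// On the device path one kernel launch per destination processor packs the
// shifted positions straight from v_pos into the retained send buffer; on the
// host the loop above does the same packing, applying the shift recorded for
// each particle in g_opart during labelling.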
template<typename send_vector, typename prp_object, int ... prp>

size_t & ghostMarker,

g_send_prp.resize(nproc);

for (size_t i = 0; i < g_send_prp.size(); i++)
{
    if (hsmem.get(i).ref() == 0)
        hsmem.get(i).incRef();

    g_send_prp.get(i).setMemory(hsmem.get(i));

    g_send_prp.get(i).resize(n_part_recv);
}
size_t accum = ghostMarker;

if (opt & RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)

    if (sizeof...(prp) != 0)
    {
        for (size_t i = 0 ; i < g_send_prp.size() ; i++)
        {
            auto ite = g_send_prp.get(i).getGPUIterator();

            if (ite.nblocks() == 0)
            {continue;}

            CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
                g_send_prp.get(i).toKernel(),
                v_prp.toKernel(),(unsigned int)accum);

            accum = accum + n_part_recv;
        }
    }

#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but it seems to be compiled with a normal compiler" << std::endl;
#endif
}

for (size_t i = 0; i < g_send_prp.size(); i++)
{
    for (size_t j = accum; j < accum + n_part_recv; j++)
    {
        object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
    }

    accum = accum + n_part_recv;
}
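// fill_send_ghost_put_prp_buf reverses the ghost_get layout: the ghost
// section of v_prp (from ghostMarker onwards) is cut into one chunk per
// processor that originally sent those ghosts (n_part_recv elements each,
// walked with accum), so the accumulated contributions can be sent back and
// merged into the owners' real particles.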
for (size_t i = nbf ; i < rt_buf.size() ; i++)
{
    rt_buf.get(i).decRef();
}
template<typename send_vector, typename v_mpl>

openfpm::vector_fr<Memory> & hsmem;

openfpm::vector_fr<Memory> & hsmem, size_t j)
:g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)

template<typename T>

g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));
template<bool inte_or_lin, typename send_vector, typename v_mpl>

openfpm::vector_fr<Memory> & hsmem,

g_send_prp.get(i).setMemory(hsmem.get(j));

g_send_prp.get(i).resize(prc_sz.get(i));
template<typename send_vector, typename v_mpl>

openfpm::vector_fr<Memory> & hsmem,

boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);

if (boost::mpl::size<v_mpl>::type::value != 0)
{
    g_send_prp.get(i).resize(prc_sz.get(i));
}
template<typename send_vector, typename prp_object, int ... prp>

if (is_layout_inte<layout_base<prop>>::value == true)
{factor *= sizeof...(prp);}

g_send_prp.resize(prc_sz.size());

for (size_t i = 0; i < hsmem.size(); i++)
{
    if (hsmem.get(i).ref() == 0)
    {hsmem.get(i).incRef();}
}

for (size_t i = 0; i < g_send_prp.size(); i++)
if (opt & RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)

    if (sizeof...(prp) != 0)
    {
        for (size_t i = 0 ; i < g_send_prp.size() ; i++)
        {
            auto ite = g_send_prp.get(i).getGPUIterator();

            CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
                v_prp.toKernel(),(unsigned int)offset);
        }
    }

#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but it seems to be compiled with a normal compiler" << std::endl;
#endif
}

if (sizeof...(prp) == 0)
{return;}

for (size_t j = 0; j < g_opart.get(i).size(); j++)
{
    typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;

    typedef decltype(g_send_prp.get(i).get(j)) encap_dst;

    object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
}
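// The encap_src/encap_dst typedefs capture the encapsulated reference types
// of the source particle and of the destination buffer slot, letting
// object_si_d copy just the properties prp... without materializing a whole
// prop object; the GPU kernel above performs the equivalent per-property copy.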
m_prp.resize(prc_sz_r.size());
m_pos.resize(prc_sz_r.size());

for (size_t i = 0; i < prc_sz_r.size() ; i++)
{
    m_pos.get(i).resize(prc_sz_r.get(i));
    m_prp.get(i).resize(prc_sz_r.get(i));
}
if (opt & RUN_ON_DEVICE)
{
    if (v_cl.size() == 1)

#if defined(CUDA_GPU) && defined(__NVCC__)

    int rank = v_cl.rank();

    starts.template deviceToHost<0>();
    size_t offset = starts.template get<0>(rank);

    if (ite.wthr.x != 0)
    {
        CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
            decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
            v_pos.toKernel(),v_prp.toKernel(),(unsigned int)offset);
    }

    for (size_t i = 0 ; i < m_pos.size() ; i++)
    {
        size_t offset = starts.template get<0>(prc_r.template get<0>(i));

        auto ite = m_pos.get(i).getGPUIterator();

        if (ite.wthr.x != 0)
        {
            CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
                decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
                m_opart.toKernel(),m_pos.get(i).toKernel(),m_prp.get(i).toKernel(),
                v_pos.toKernel(),v_prp.toKernel(),(unsigned int)offset);
        }
    }

#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but it seems to be compiled with a normal compiler" << std::endl;
#endif
}
long int id_end = v_pos.size();

long int end = m_opart.size()-1;

for (size_t i = 0; i < m_opart.size(); i++)
{
    process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
}

v_pos.resize(v_pos.size() - m_opart.size());
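// process_map_particle moves each labelled particle into its destination
// buffer and backfills the freed slot with a particle taken from the end of
// v_pos/v_prp (tracked through end/id_end), so the final resize simply drops
// the tail occupied by departed particles.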
template<typename prp_object, int ... prp>

m_prp.resize(prc_sz_r.size());
m_pos.resize(prc_sz_r.size());

for (size_t i = 0; i < prc_sz_r.size(); i++)
{
    m_pos.get(i).resize(prc_sz_r.get(i));
    m_prp.get(i).resize(prc_sz_r.get(i));
}

long int id_end = v_pos.size();

long int end = m_opart.size()-1;

for (size_t i = 0; i < m_opart.size(); i++)
{
    process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
}

v_pos.resize(v_pos.size() - m_opart.size());
layout_base> & lbl_p,

if (opt == RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)

    lbl_p.resize(v_pos.size());

    prc_sz.template fill<0>(0);

    auto ite = v_pos.getGPUIterator();
    if (ite.wthr.x == 0)
    {
        starts.template fill<0>(0);
    }

    if (v_cl.size() == 1)

    for (size_t i = 0 ; i < dim ; i++)
    {bc.bc[i] = dec.periodicity(i);}

    CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());

    CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
        dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),(int)v_cl.rank());

    openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(),prc_sz.size(),(unsigned int *)starts.template getDeviceBuffer<0>(),v_cl.getGpuContext());

    prc_sz.template deviceToHost<0>();

    ite = lbl_p.getGPUIterator();

    CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());

#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: it seems you tried to call map with the RUN_ON_DEVICE option; this requires compiling the program with NVCC" << std::endl;
#endif
}
prc_sz.template fill<0>(0);

auto it = v_pos.getIterator();

auto key = it.get();

dec.applyPointBC(v_pos.get(key));

if (dec.getDomain().isInside(v_pos.get(key)) == true)
{p_id = dec.processorID(v_pos.get(key));}
else
{p_id = obp::out(key,v_cl.getProcessUnitID());}

if (p_id != v_cl.getProcessUnitID())
{
    if ((long int)p_id != -1)
    {
        prc_sz.template get<0>(p_id)++;

        lbl_p.last().template get<0>() = key;
        lbl_p.last().template get<2>() = p_id;
    }
    else
    {
        lbl_p.last().template get<0>() = key;
        lbl_p.last().template get<2>() = p_id;
    }
}
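// obp is the out-of-bound policy (KillParticle by default): it decides what
// happens to particles that exit the domain. A -1 id marks the particle for
// deletion, which is why the -1 branch records the label without
// incrementing any send counter.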
size_t & ghostMarker,

if (opt & RUN_ON_DEVICE)
{
    ::run(mem,
        dec,
        g_opart_device,
        proc_id_out,
        starts,
        v_cl,v_pos,v_prp,prc,
        prc_sz,
        prc_offset,ghostMarker,opt);
}

auto it = v_pos.getIteratorTo(ghostMarker);

auto key = it.get();

for (size_t i = 0; i < vp_id.size(); i++)
{
    size_t p_id = vp_id.get(i).first;

    g_opart.get(p_id).last().template get<0>() = key;
    g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
}

g_opart.get(i).swap(g_opart_f.last());
prc.add(dec.IDtoProc(i));
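// Host labelling: for every real particle the decomposition returns the list
// of (processor, shift) pairs vp_id whose ghost layers it touches; the labels
// are grouped per processor in g_opart, and the non-empty groups are finally
// compacted while the corresponding ranks are recorded in prc.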
#ifdef EXTREA_TRACE_PRE_COMM
Extrae_user_function (0);
#endif
1606 :
v_cl(create_vcluster<Memory>()),
dec(create_vcluster()),
lg_m(0)
1638 :
v_cl(create_vcluster<Memory>()),
dec(create_vcluster()),
lg_m(0)
for (size_t i = 0 ; i < hsmem.size() ; i++)
{
    if (hsmem.get(i).ref() == 1)
        hsmem.get(i).decRef();
    else
        std::cout << __FILE__ << ":" << __LINE__ << " internal error: memory is in an invalid state" << std::endl;
}
this->v_sub_unit_factor = n_sub;

const size_t (& bc)[dim],

if (opt & BIND_DEC_TO_GHOST)

CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;

cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);

for (size_t i = 0 ; i < dim ; i++)
{div[i] = cd_sm.getDiv()[i] - 2*pad;}

dec.setParameters(div, box, bc, g, gdist);
const size_t (& bc)[dim],

for (size_t i = 0 ; i < dim ; i++)
{div[i] = gdist.size(i);}

dec.setParameters(div, box, bc, g);
size_t & ghostMarker,
size_t opt = WITH_POSITION)

#ifdef PROFILE_SCOREP
SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif

if (!(opt & NO_POSITION))
{v_pos.resize(ghostMarker);}

if (!(opt & SKIP_LABELLING))
{v_prp.resize(ghostMarker);}

if ((opt & SKIP_LABELLING) == false)

#if defined(CUDA_GPU) && defined(__NVCC__)
cudaDeviceSynchronize();
#endif

if (!(opt & NO_POSITION))

#if defined(CUDA_GPU) && defined(__NVCC__)
cudaDeviceSynchronize();
#endif

if (!(opt & SKIP_LABELLING))
{
    v_prp.resize(v_pos.size());
}
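// Typical call sequence seen from the user-facing vector_dist (a sketch; vd,
// its dimensionality and property list are hypothetical):
//
//   vd.map();                        // redistribute particles across processors
//   vd.ghost_get<0,1>();             // full labelling + ghost exchange
//   // ... update property 0 without moving any particle ...
//   vd.ghost_get<0>(SKIP_LABELLING); // cheap re-sync reusing the old labels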
size_t & ghostMarker,
size_t opt = WITH_POSITION)

if (opt & RUN_ON_DEVICE)
{
    std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
}

v_pos.resize(ghostMarker);
v_prp.resize(ghostMarker);

for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
{
    if (prc_sz.template get<0>(i) != 0)
    {
        prc_sz_r.add(prc_sz.template get<0>(i));
    }
}

if (opt & MAP_LOCAL)

for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)

v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);

ghostMarker = v_pos.size();
template<typename obp = KillParticle>

#ifdef PROFILE_SCOREP
SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif

v_pos.resize(ghostMarker);
v_prp.resize(ghostMarker);

if (opt & RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)
    cudaDeviceSynchronize();
    opt_ |= MPI_GPU_DIRECT;
#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
}

v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,

ghostMarker = v_pos.size();
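// map_ drops the ghost section up front (resize back to ghostMarker) because
// ghosts are rebuilt by the next ghost_get; with RUN_ON_DEVICE the device is
// synchronized first so the buffers produced by the CUDA kernels are complete
// before MPI (optionally GPU-direct) reads them, and the ghost marker is
// finally reset to the new number of local particles.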
vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)

vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
template<template<typename,typename> class op, int ... prp>

size_t & ghostMarker,

if (opt & RUN_ON_DEVICE)
{
#if defined(CUDA_GPU) && defined(__NVCC__)
    cudaDeviceSynchronize();
#else
    std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
}

if (opt & NO_CHANGE_ELEMENTS)

size_t opt_ = compute_options(opt);

if (opt & RUN_ON_DEVICE)

size_t opt_ = compute_options(opt);

if (opt & RUN_ON_DEVICE)

std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;

if (opt & RUN_ON_DEVICE)

v_prp.template merge_prp_v_device<op,prop,Memory,

v_prp.template merge_prp_v<op,prop,Memory,
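// User-level sketch with the vector_dist API: ghost_put sends the values
// accumulated on ghost copies back to the owning processor and merges them
// with the operation op, e.g.
//
//   vd.ghost_get<0>();                 // create/refresh the ghost layer
//   // ... scatter contributions into property 1 of ghost particles ...
//   vd.template ghost_put<add_,1>();   // add ghost contributions to owners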