#ifndef VCLUSTER_BASE_HPP_
#define VCLUSTER_BASE_HPP_

#include "util/cuda_util.hpp"
#include "MPI_wrapper/MPI_util.hpp"
#include "Vector/map_vector.hpp"
#include "MPI_wrapper/MPI_IallreduceW.hpp"
#include "MPI_wrapper/MPI_IrecvW.hpp"
#include "MPI_wrapper/MPI_IsendW.hpp"
#include "MPI_wrapper/MPI_IAllGather.hpp"
#include "MPI_wrapper/MPI_IBcastW.hpp"
#include "Vector/map_vector.hpp"
#include "util/check_no_pointers.hpp"
#include "util/util_debug.hpp"
#include "util/Vcluster_log.hpp"
#include "memory/BHeapMemory.hpp"
#include "Packer_Unpacker/has_max_prop.hpp"
#include "data_type/aggregate.hpp"
#include "util/ofp_context.hpp"

#if defined (ENABLE_NUMERICS) && defined (HAVE_PETSC)
extern double time_spent;

constexpr int MSG_LENGTH = 1024;
constexpr int MSG_SEND_RECV = 1025;
constexpr int SEND_SPARSE = 8192;
constexpr int NONE = 1;
constexpr int NEED_ALL_SIZE = 2;

constexpr int SERIVCE_MESSAGE_TAG = 16384;
constexpr int SEND_RECV_BASE = 4096;
constexpr int GATHER_BASE = 24576;

constexpr int RECEIVE_KNOWN = 4;
constexpr int KNOWN_ELEMENT_OR_BYTE = 8;
constexpr int MPI_GPU_DIRECT = 16;

constexpr int NQUEUE = 4;

extern size_t n_vcluster;
extern bool global_mpi_init;
extern bool ofp_initialized;
extern size_t tot_sent;
extern size_t tot_recv;
template<typename T> void assign(T * ptr1, T * ptr2)

template<typename InternalMemory>
NBX_Type NBX_active[NQUEUE];

int NBX_prc_cnt_base = 0;
size_t NBX_prc_n_send[NQUEUE];
size_t * NBX_prc_prc[NQUEUE];
void ** NBX_prc_ptr[NQUEUE];
size_t * NBX_prc_sz[NQUEUE];
size_t NBX_prc_n_recv[NQUEUE];
void * (* NBX_prc_msg_alloc[NQUEUE])(size_t,size_t,size_t,size_t,size_t,size_t,void *);
size_t * NBX_prc_prc_recv[NQUEUE];
void * NBX_prc_ptr_arg[NQUEUE];
void queue_all_sends(size_t n_send, size_t sz[], size_t prc[], void * ptr[])

{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " this function must be called when no other requests are in progress. Remember that if you use functions like max(), sum(), send() or recv(), you must not forget to call execute() \n";}
for (size_t i = 0 ; i < n_send ; i++)

check_valid(ptr[i],sz[i]);

if (sz[i] > 2147483647)
{MPI_SAFE_CALL(MPI_Issend(ptr[i], (sz[i] >> 3) + 1, MPI_DOUBLE, prc[i], SEND_SPARSE + (NBX_cnt + NBX_prc_qcnt)*131072 + i, ext_comm, &req.last()));}
openfpm::vector_fr<BMemory<InternalMemory>> recv_buf[NQUEUE];
int already_finalised;

MPI_Finalized(&already_finalised);
if (!already_finalised && ext_comm == MPI_COMM_WORLD)

if (MPI_Finalize() != 0)

std::cerr << __FILE__ << ":" << __LINE__ << " MPI_Finalize FAILED \n";
for (unsigned int i = 0 ; i < NQUEUE ; i++)

NBX_active[i] = NBX_Type::NBX_UNACTIVE;

check_new(this,8,VCLUSTER_EVENT,PRJ_VCLUSTER);
int already_initialised;
MPI_Initialized(&already_initialised);

if (!already_initialised)

MPI_Comm_split_type(ext_comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);

MPI_Comm_rank(shmcomm, &shmrank);
MPI_Comm_free(&shmcomm);
#ifdef EXTERNAL_SET_GPU

#if defined(PRINT_RANK_TO_GPU) && defined(CUDA_GPU)
char node_name[MPI_MAX_PROCESSOR_NAME];

MPI_Get_processor_name(node_name,&len);

std::cout << "Rank: " << m_rank << " on host: " << node_name << " work on GPU: " << gpuContext->getDevice() << "/" << gpuContext->getNDevice() << std::endl;

MPI_Comm_get_attr(ext_comm, MPI_TAG_UB, &tag_ub_v, &flag);

tag_ub = *(int*)tag_ub_v;
nbx_cycle = (tag_ub - SEND_SPARSE - 131072 - NQUEUE*131072) / 131072;

std::cerr << __FILE__ << ":" << __LINE__ << " Error MPI_TAG_UB is too small for OpenFPM" << std::endl;
template<typename T> void checkType()

if (std::is_fundamental<T>::value == true)

if (std::is_pointer<T>::value == true)
{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name()) << " is a pointer, sending pointer values makes no sense\n";}

if (std::is_lvalue_reference<T>::value == true)
{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name()) << " is an lvalue reference, sending reference values makes no sense\n";}

if (std::is_rvalue_reference<T>::value == true)
{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name()) << " is an rvalue reference, sending reference values makes no sense\n";}

std::cerr << "Warning: " << __FILE__ << ":" << __LINE__ << " impossible to check the type " << demangle(typeid(T).name()) << " please consider adding a static method \"static bool noPointers()\" \n";

std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name()) << " has pointers inside, sending pointer values makes no sense\n";
std::cout << __FILE__ << ":" << __LINE__ << " Warning: it seems that a gpu context is not initialized. "
"Either a compatible working cuda device has not been found, or openfpm_init has been called in a file that was not compiled with NVCC" << std::endl;

return this->m_size*numPE;
#ifdef VCLUSTER_PERF_REPORT

std::cout << "-- REPORT COMMUNICATIONS -- " << std::endl;

std::cout << "Processor " << this->rank() << " sent: " << tot_sent << std::endl;
std::cout << "Processor " << this->rank() << " received: " << tot_recv << std::endl;

std::cout << "Processor " << this->rank() << " time spent: " << time_spent << std::endl;
std::cout << "Processor " << this->rank() << " Bandwidth: S:" << (double)tot_sent / time_spent * 1e-9 << "GB/s R:" << (double)tot_recv / time_spent * 1e-9 << "GB/s" << std::endl;

std::cout << "Error: to activate performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;

#ifdef VCLUSTER_PERF_REPORT

std::cout << "Error: to activate performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;
template<typename T> void sum(T & num)

template<typename T> void max(T & num)

template<typename T> void min(T & num)
MPI_SAFE_CALL(MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,ext_comm,&stat,&stat_t));

unsigned int i = (stat_t.MPI_TAG - SEND_SPARSE) / 131072 - NBX_prc_cnt_base;

if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)

bool big_data = true;

MPI_SAFE_CALL(MPI_Get_count(&stat_t,MPI_DOUBLE,&msize_));
if (msize_ == MPI_UNDEFINED)

MPI_SAFE_CALL(MPI_Get_count(&stat_t,MPI_BYTE,&msize_));

msize = ((size_t)msize_) << 3;

if (stat_t.MPI_TAG >= (int)(SEND_SPARSE + NBX_prc_cnt_base*131072) && stat_t.MPI_TAG < (int)(SEND_SPARSE + (NBX_prc_cnt_base + NBX_prc_qcnt + 1)*131072))

void * ptr = this->NBX_prc_msg_alloc[i](msize,0,0,stat_t.MPI_SOURCE,rid[i],stat_t.MPI_TAG,this->NBX_prc_ptr_arg[i]);

check_valid(ptr,msize);
#ifdef VCLUSTER_GARBAGE_INJECTOR
#if defined (__NVCC__) && !defined(CUDA_ON_CPU)
cudaPointerAttributes cpa;
auto error = cudaPointerGetAttributes(&cpa,ptr);
if (error == cudaSuccess)

if(cpa.type == cudaMemoryTypeDevice)
{cudaMemset(ptr,0xFF,msize);}

{memset(ptr,0xFF,msize);}

memset(ptr,0xFF,msize);
if (big_data == true)

MPI_SAFE_CALL(MPI_Recv(ptr,msize >> 3,MPI_DOUBLE,stat_t.MPI_SOURCE,stat_t.MPI_TAG,ext_comm,&stat_t));

MPI_SAFE_CALL(MPI_Recv(ptr,msize,MPI_BYTE,stat_t.MPI_SOURCE,stat_t.MPI_TAG,ext_comm,&stat_t));

check_valid(ptr,msize);
for (unsigned int i = 0 ; i < NQUEUE ; i++)

if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)

{MPI_SAFE_CALL(MPI_Testall(req.size(),&req.get(0),&flag,MPI_STATUSES_IGNORE));}
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

#ifdef VCLUSTER_PERF_REPORT

for (size_t i = 0 ; i < prc.size() ; i++)
{send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

for (size_t i = 0 ; i < prc_recv.size() ; i++)

void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));

#ifdef VCLUSTER_PERF_REPORT

time_spent += nbx_timer.getwct();
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE << " asynchronous communication functions " << std::endl;
for (size_t i = 0 ; i < prc.size() ; i++)
{send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

for (size_t i = 0 ; i < prc_recv.size() ; i++)

void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

checkType<typename T::value_type>();

for (size_t i = 0 ; i < prc.size() ; i++)
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

checkType<typename T::value_type>();

for (size_t i = 0 ; i < prc.size() ; i++)
size_t prc[] , void * ptr[],
size_t n_recv, size_t prc_recv[] ,
size_t sz_recv[] , void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

#ifdef VCLUSTER_PERF_REPORT

for (size_t i = 0 ; i < n_send ; i++)
{send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

for (size_t i = 0 ; i < n_recv ; i++)

void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);

#ifdef VCLUSTER_PERF_REPORT

time_spent += nbx_timer.getwct();
size_t prc[] , void * ptr[],
size_t n_recv, size_t prc_recv[] ,
size_t sz_recv[] , void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE << " asynchronous communication functions " << std::endl;

for (size_t i = 0 ; i < n_send ; i++)
{send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

for (size_t i = 0 ; i < n_recv ; i++)

void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);
void * ptr[], size_t n_recv, size_t prc_recv[] ,
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

#ifdef VCLUSTER_PERF_REPORT

sz_recv_tmp.resize(n_recv);

for (size_t i = 0 ; i < n_send ; i++)
{send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

for (size_t i = 0 ; i < n_recv ; i++)
{recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}

for (size_t i = 0 ; i < n_send ; i++)
{send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

for (size_t i = 0 ; i < n_recv ; i++)

void * ptr_recv = msg_alloc(sz_recv_tmp.get(i),0,0,prc_recv[i],i,0,ptr_arg);

recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));

#ifdef VCLUSTER_PERF_REPORT

time_spent += nbx_timer.getwct();
void * ptr[], size_t n_recv, size_t prc_recv[] ,
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt=NONE)

std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE << " asynchronous communication functions " << std::endl;

sz_recv_tmp.resize(n_recv);

for (size_t i = 0 ; i < n_send ; i++)
{send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

for (size_t i = 0 ; i < n_recv ; i++)
{recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}
size_t prc[] , void * ptr[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt = NONE)

#ifdef VCLUSTER_PERF_REPORT

std::cout << __FILE__ << ":" << __LINE__ << " error: there are some asynchronous calls running, you have to complete them before going back to synchronous" << std::endl;

queue_all_sends(n_send,sz,prc,ptr);

}
while (flag == false);

#ifdef VCLUSTER_PERF_REPORT

time_spent += nbx_timer.getwct();
size_t prc[] , void * ptr[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg, long int opt = NONE)

queue_all_sends(n_send,sz,prc,ptr);
for (unsigned int j = 0 ; j < NQUEUE ; j++)

if (NBX_active[j] == NBX_Type::NBX_UNACTIVE)

if (NBX_active[j] == NBX_Type::NBX_KNOWN_PRC)

for (size_t i = 0 ; i < NBX_prc_n_send[j] ; i++)
{send(NBX_prc_prc[j][i],SEND_SPARSE + NBX_cnt*131072,NBX_prc_ptr[j][i],NBX_prc_sz[j][i]);}

for (size_t i = 0 ; i < NBX_prc_n_recv[j] ; i++)

void * ptr_recv = NBX_prc_msg_alloc[j](sz_recv_tmp.get(i),0,0,NBX_prc_prc_recv[j][i],i,0,this->NBX_prc_ptr_arg[j]);

recv(NBX_prc_prc_recv[j][i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));

NBX_active[j] = NBX_Type::NBX_KNOWN;

if (NBX_active[j] == NBX_Type::NBX_KNOWN)

NBX_active[j] = NBX_Type::NBX_UNACTIVE;

}
while (flag == false);

NBX_active[j] = NBX_Type::NBX_UNACTIVE;
bool send(size_t proc, size_t tag, const void * mem, size_t sz)

MPI_IsendWB::send(proc,SEND_RECV_BASE + tag,mem,sz,req.last(),ext_comm);
bool recv(size_t proc, size_t tag, void * v, size_t sz)
template<typename T, typename Mem, template<typename> class layout_base>

for (size_t i = 0 ; i < NQUEUE ; i++)
Set of wrapping classes for MPI_Iallreduce.
static void recv(size_t proc, size_t tag, void *buf, size_t sz, MPI_Request &req, MPI_Comm ext_comm)
General recv for a general buffer.
General send for a vector of any type.
This class virtualizes the cluster of PCs as a set of processes that communicate.
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages asynchronous version.
void progressCommunication()
In case of asynchronous communications like sendrecvMultipleMessagesNBXAsync, this function progresses the pending communication.
MPI_Request bar_req
barrier request
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
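A minimal usage sketch of the member above, assuming an already initialized Vcluster-like object v_cl; the container name inbox is illustrative and not part of the API. In the calls shown in the source, msg_alloc receives the message size, two aggregate arguments passed as 0, the source processor, a request index, the tag, and the ptr_arg pointer, and must return a buffer of at least that size.

#include <cstddef>
#include <map>
#include <vector>

// Allocation callback: keep one receive buffer per source rank in a user-side map.
static void * msg_alloc(size_t msg_size, size_t, size_t,
                        size_t proc, size_t, size_t, void * ptr_arg)
{
    auto & inbox = *static_cast<std::map<size_t, std::vector<char>> *>(ptr_arg);
    inbox[proc].resize(msg_size);
    return inbox[proc].data();     // buffer the library receives into
}

// With n_send peers listed in prc[], payload pointers in ptr[] and sizes in sz[],
// and the n_recv expected senders in prc_recv[]:
// std::map<size_t, std::vector<char>> inbox;
// v_cl.sendrecvMultipleMessagesNBX(n_send, sz, prc, ptr, n_recv, prc_recv,
//                                  msg_alloc, &inbox);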
void execute()
Execute all the requests.
int NBX_prc_qcnt
NBX communications on queue (-1 means 0, 0 means 1, 1 means 2, ...)
MPI_Comm getMPIComm()
Get the MPI_Communicator (or processor group) this VCluster is using.
Vcluster_base(int *argc, char ***argv, MPI_Comm ext_comm)
Virtual cluster constructor.
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
size_t rank()
Get the process unit id.
size_t size()
Get the total number of processors.
void sum(T &num)
Sum the numbers across all processors and get the result.
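A small usage sketch (variable names are illustrative, assuming an initialized Vcluster-like object v_cl): reductions such as sum(), max() and min() only queue the request, so execute() has to be called before the result is read, as the error message in queue_all_sends() above also reminds.

size_t local_count = 42;   // per-processor contribution (illustrative)
v_cl.sum(local_count);     // queue the reduction request
v_cl.execute();            // complete all pending requests
// local_count now holds the sum over all processors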
openfpm::vector_fr< BMemory< InternalMemory > > recv_buf[NQUEUE]
Receive buffers.
bool send(size_t proc, size_t tag, openfpm::vector< T, Mem, gr > &v)
Send data to a processor.
Vcluster_base(const Vcluster_base &)
disable copy constructor
void sendrecvMultipleMessagesNBX(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
int m_size
number of processes
Vcluster_base & operator=(const Vcluster_base &)
disable operator=
void clear()
Release the buffer used for communication.
bool Bcast(openfpm::vector< T, Mem, layout_base > &v, size_t root)
Broadcast the data to all processors.
void sendrecvMultipleMessagesNBX(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, openfpm::vector< size_t > &prc_recv, openfpm::vector< size_t > &recv_sz, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], size_t sz_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages asynchronous version.
size_t getProcessUnitID()
Get the process unit id.
openfpm::vector< size_t > proc_com
openfpm::vector< size_t > sz_send[NQUEUE]
vector of the size of send buffers
void min(T &num)
Get the minimum number across all processors (or reduction with infinity norm)
gpu::ofp_context_t & getGpuContext(bool iw=true)
If nvidia cuda is activated return a gpu context.
size_t getProcessingUnits()
Get the total number of processors.
openfpm::vector< void * > ptr_send[NQUEUE]
vector of pointers of send buffers
gpu::ofp_context_t * gpuContext
standard context for gpu (if cuda is detected, otherwise unused)
bool recv(size_t proc, size_t tag, openfpm::vector< T, Mem, gr > &v)
Recv data from a processor.
openfpm::vector< int > map_scatter
vector that contains the scatter map (it is basically an array of one)
void sendrecvMultipleMessagesNBXAsync(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages asynchronous version.
openfpm::vector< size_t > tags[NQUEUE]
tags receiving
bool recv(size_t proc, size_t tag, void *v, size_t sz)
Recv data from a processor.
int shmrank
rank within the node
std::vector< int > post_exe
vector of functions to execute after all the requests have been performed
openfpm::vector< MPI_Request > req
vector of MPI requests
void sendrecvMultipleMessagesNBXWait()
Send and receive multiple messages: wait for the NBX communication to complete.
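A sketch of the asynchronous pattern built from the members documented here, reusing the msg_alloc callback and inbox map from the earlier sketch (buffer lifetime handling elided): queue the round with the Async variant, optionally overlap work and call progressCommunication(), then block in sendrecvMultipleMessagesNBXWait().

// Queue the communication round without blocking.
v_cl.sendrecvMultipleMessagesNBXAsync(n_send, sz, prc, ptr, msg_alloc, &inbox);

do_local_work();                        // hypothetical work overlapped with communication
v_cl.progressCommunication();           // optionally progress pending NBX traffic

v_cl.sendrecvMultipleMessagesNBXWait(); // block until the queued NBX round completes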
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
bool allGather(T &send, openfpm::vector< T, Mem, gr > &v)
Gather the data from all processors.
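A usage sketch for the gather collective, under the same queued-request model (v_cl and the result container are illustrative):

size_t my_value = v_cl.rank();          // one value contributed per processor
openfpm::vector<size_t> all_values;
v_cl.allGather(my_value, all_values);   // queue the gather
v_cl.execute();                         // complete it
// all_values now contains one entry per processor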
int numPE
number of processing unit per process
bool NBX_prc_reached_bar_req[NQUEUE]
Is the barrier request reached.
void max(T &num)
Get the maximum number across all processors (or reduction with infinity norm)
MPI_Comm ext_comm
external communicator
openfpm::vector< MPI_Status > stat
vector of MPI status
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], size_t sz_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
MPI_Status bar_stat
barrier status
void sendrecvMultipleMessagesNBXAsync(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, openfpm::vector< size_t > &prc_recv, openfpm::vector< size_t > &recv_sz, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages asynchronous version.
bool send(size_t proc, size_t tag, const void *mem, size_t sz)
Send data to a processor.
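A sketch of the low-level point-to-point members; as with the reductions, send() and recv() queue MPI requests (send() forwards to MPI_IsendWB::send in the source above), so execute() completes them. Ranks, tag and buffers are illustrative.

double payload = 3.14;
double incoming = 0.0;

if (v_cl.rank() == 0)
    v_cl.send(1, 100, &payload, sizeof(payload));   // send 8 bytes to rank 1 with tag 100
else if (v_cl.rank() == 1)
    v_cl.recv(0, 100, &incoming, sizeof(incoming)); // matching receive on rank 1

v_cl.execute();   // complete the queued requests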
Class for cpu time benchmarking.
void stop()
Stop the timer.
void start()
Start the timer.
double getwct()
Return the elapsed real time.
This class checks whether the type T has pointers inside.
temporary buffer for reductions
unsigned int ui
unsigned integer
unsigned short us
unsigned short
unsigned char uc
unsigned char