#ifndef VCLUSTER_BASE_HPP_
#define VCLUSTER_BASE_HPP_

#include "util/cuda_util.hpp"
#include "MPI_wrapper/MPI_util.hpp"
#include "Vector/map_vector.hpp"
#include "MPI_wrapper/MPI_IallreduceW.hpp"
#include "MPI_wrapper/MPI_IrecvW.hpp"
#include "MPI_wrapper/MPI_IsendW.hpp"
#include "MPI_wrapper/MPI_IAllGather.hpp"
#include "MPI_wrapper/MPI_IBcastW.hpp"
#include "util/check_no_pointers.hpp"
#include "util/util_debug.hpp"
#include "util/Vcluster_log.hpp"
#include "memory/BHeapMemory.hpp"
#include "Packer_Unpacker/has_max_prop.hpp"
#include "data_type/aggregate.hpp"
#include "util/cuda/ofp_context.hxx"

extern double time_spent;
constexpr int MSG_LENGTH = 1024;
constexpr int MSG_SEND_RECV = 1025;
constexpr int SEND_SPARSE = 8192;
constexpr int NONE = 1;
constexpr int NEED_ALL_SIZE = 2;

constexpr int SERIVCE_MESSAGE_TAG = 16384;
constexpr int SEND_RECV_BASE = 4096;
constexpr int GATHER_BASE = 24576;

constexpr int RECEIVE_KNOWN = 4;
constexpr int KNOWN_ELEMENT_OR_BYTE = 8;
constexpr int MPI_GPU_DIRECT = 16;

constexpr int NQUEUE = 4;
extern size_t n_vcluster;
extern bool global_mpi_init;
extern bool ofp_initialized;
extern size_t tot_sent;
extern size_t tot_recv;

extern size_t NBX_cnt;
template<typename T>
void assign(T * ptr1, T * ptr2)
template<typename InternalMemory>
class Vcluster_base
NBX_Type NBX_active[NQUEUE];

int NBX_prc_cnt_base = 0;
size_t NBX_prc_n_send[NQUEUE];
size_t * NBX_prc_prc[NQUEUE];
void ** NBX_prc_ptr[NQUEUE];
size_t * NBX_prc_sz[NQUEUE];
size_t NBX_prc_n_recv[NQUEUE];
void * (* NBX_prc_msg_alloc[NQUEUE])(size_t,size_t,size_t,size_t,size_t,size_t,void *);
size_t * NBX_prc_prc_recv[NQUEUE];
void * NBX_prc_ptr_arg[NQUEUE];
void queue_all_sends(size_t n_send, size_t sz[], size_t prc[], void * ptr[])
{
    {std::cerr << "Error: " << __FILE__ << ":" << __LINE__
               << " this function must be called when no other requests are in progress."
                  " Remember that if you use functions like max(), sum(), send() or recv(),"
                  " you must not forget to call execute()\n";}

    for (size_t i = 0 ; i < n_send ; i++)
    {
        check_valid(ptr[i],sz[i]);

        MPI_SAFE_CALL(MPI_Issend(ptr[i], sz[i], MPI_BYTE, prc[i],
                                 SEND_SPARSE + (NBX_cnt + NBX_prc_qcnt)*131072 + i,
                                 MPI_COMM_WORLD, &req.last()));
    }
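// The tag layout used above, sketched with illustrative values (SEND_SPARSE = 8192,
// a window of 131072 tags per NBX round, i = destination index within the round):
//
//   round 0, message 0  ->  8192 + 0*131072 + 0 = 8192
//   round 0, message 5  ->  8192 + 0*131072 + 5 = 8197
//   round 1, message 0  ->  8192 + 1*131072 + 0 = 139264
//
// so concurrently queued rounds and per-destination messages never share a tag.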
int already_finalised;

MPI_Finalized(&already_finalised);
if (!already_finalised)
{
    if (MPI_Finalize() != 0)
    {
        std::cerr << __FILE__ << ":" << __LINE__ << " MPI_Finalize FAILED \n";
    }
}
for (unsigned int i = 0 ; i < NQUEUE ; i++)
{   NBX_active[i] = NBX_Type::NBX_UNACTIVE;   }

check_new(this,8,VCLUSTER_EVENT,PRJ_VCLUSTER);
int already_initialised;
MPI_Initialized(&already_initialised);

// initialize MPI only if the application has not already done it
if (!already_initialised)
{
    MPI_Init(argc,argv);
}

// rank within the shared-memory node
MPI_Comm shmcomm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                    MPI_INFO_NULL, &shmcomm);

MPI_Comm_rank(shmcomm, &shmrank);
MPI_Comm_free(&shmcomm);

MPI_Comm_size(MPI_COMM_WORLD, &m_size);
MPI_Comm_rank(MPI_COMM_WORLD, &m_rank);
#ifdef EXTERNAL_SET_GPU
    context = new mgpu::ofp_context_t(mgpu::gpu_context_opt::no_print_props,dev);
#else
    context = new mgpu::ofp_context_t(mgpu::gpu_context_opt::no_print_props,shmrank);
#endif
#if defined(PRINT_RANK_TO_GPU) && defined(CUDA_GPU)
    char node_name[MPI_MAX_PROCESSOR_NAME];
    int len;

    MPI_Get_processor_name(node_name,&len);

    std::cout << "Rank: " << m_rank << " on host: " << node_name
              << " work on GPU: " << context->getDevice() << "/"
              << context->getNDevice() << std::endl;
#endif
MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_v, &flag);
tag_ub = *(int*)tag_ub_v;

nbx_cycle = (tag_ub - SEND_SPARSE - 131072 - NQUEUE*131072) / 131072;

{std::cerr << __FILE__ << ":" << __LINE__ << " Error MPI_TAG_UB is too small for OpenFPM" << std::endl;}
template<typename T>
void checkType()
{
    // if T is a fundamental type there is nothing to check
    if (std::is_fundamental<T>::value == true)
        return;

    if (std::is_pointer<T>::value == true)
    {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
               << " is a pointer, sending pointer values makes no sense\n";}

    if (std::is_lvalue_reference<T>::value == true)
    {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
               << " is an lvalue reference, sending reference values makes no sense\n";}

    if (std::is_rvalue_reference<T>::value == true)
    {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
               << " is an rvalue reference, sending reference values makes no sense\n";}

    std::cerr << "Warning: " << __FILE__ << ":" << __LINE__ << " impossible to check the type " << demangle(typeid(T).name())
              << ", please consider adding a static method \"static bool noPointers()\"\n";

    std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
              << " has pointers inside, sending pointer values makes no sense\n";
if (context == NULL && iw == true)
{
    std::cout << __FILE__ << ":" << __LINE__ << " Warning: it seems that the modern gpu context is not initialized."
                 " Either a compatible working cuda device has not been found,"
                 " or openfpm_init has been called in a file not compiled with NVCC" << std::endl;
}
return MPI_COMM_WORLD;
return this->m_size*numPE;
#ifdef VCLUSTER_PERF_REPORT
    std::cout << "-- REPORT COMMUNICATIONS -- " << std::endl;

    std::cout << "Processor " << this->rank() << " sent: " << tot_sent << std::endl;
    std::cout << "Processor " << this->rank() << " received: " << tot_recv << std::endl;

    std::cout << "Processor " << this->rank() << " time spent: " << time_spent << std::endl;
    std::cout << "Processor " << this->rank() << " Bandwidth: S:" << (double)tot_sent / time_spent * 1e-9
              << " GB/s  R:" << (double)tot_recv / time_spent * 1e-9 << " GB/s" << std::endl;
#else
    std::cout << "Error: to activate the performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;
#endif

#ifdef VCLUSTER_PERF_REPORT

#else
    std::cout << "Error: to activate the performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;
#endif
template<typename T>
void sum(T & num)

template<typename T>
void max(T & num)

template<typename T>
void min(T & num)
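/*! Usage sketch for the reductions above (illustrative only): the result is valid
 *  only after execute() completes the queued requests. "create_vcluster()" is the
 *  usual helper from VCluster.hpp and is assumed here, it is not defined in this header.
 * \code
 *    Vcluster<> & v_cl = create_vcluster();
 *
 *    float local = (float)v_cl.rank();   // each rank contributes its own value
 *    v_cl.sum(local);                    // queue the reduction
 *    v_cl.execute();                     // complete it; local now holds the global sum
 * \endcode
 */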
MPI_SAFE_CALL(MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&stat,&stat_t));

unsigned int i = (stat_t.MPI_TAG - SEND_SPARSE) / 131072 - NBX_prc_cnt_base;

if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)
{return;}

MPI_SAFE_CALL(MPI_Get_count(&stat_t,MPI_BYTE,&msize));

if (stat_t.MPI_TAG >= (int)(SEND_SPARSE + NBX_prc_cnt_base*131072) &&
    stat_t.MPI_TAG <  (int)(SEND_SPARSE + (NBX_prc_cnt_base + NBX_prc_qcnt + 1)*131072))
{
    void * ptr = this->NBX_prc_msg_alloc[i](msize,0,0,stat_t.MPI_SOURCE,rid[i],stat_t.MPI_TAG,this->NBX_prc_ptr_arg[i]);

    check_valid(ptr,msize);

#ifdef VCLUSTER_GARBAGE_INJECTOR
#if defined (__NVCC__) && !defined(CUDA_ON_CPU)
    cudaPointerAttributes cpa;
    auto error = cudaPointerGetAttributes(&cpa,ptr);
    if (error == cudaSuccess)
    {
        if (cpa.type == cudaMemoryTypeDevice)
        {cudaMemset(ptr,0xFF,msize);}
        else
        {memset(ptr,0xFF,msize);}
    }
#else
    memset(ptr,0xFF,msize);
#endif
#endif

    MPI_SAFE_CALL(MPI_Recv(ptr,msize,MPI_BYTE,stat_t.MPI_SOURCE,stat_t.MPI_TAG,MPI_COMM_WORLD,&stat_t));

    check_valid(ptr,msize);
}

for (unsigned int i = 0 ; i < NQUEUE ; i++)
{
    if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)
    {continue;}

    {MPI_SAFE_CALL(MPI_Testall(req.size(),&req.get(0),&flag,MPI_STATUSES_IGNORE));}
}
// sendrecvMultipleMessagesNBX(prc, data, prc_recv, recv_sz, msg_alloc, ptr_arg, opt):
// both the receiving processors and the receive sizes are known
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

#ifdef VCLUSTER_PERF_REPORT
    nbx_timer.start();
#endif

    for (size_t i = 0 ; i < prc.size() ; i++)
    {send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

    for (size_t i = 0 ; i < prc_recv.size() ; i++)
    {
        void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

        recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));
    }

#ifdef VCLUSTER_PERF_REPORT
    time_spent += nbx_timer.getwct();
#endif
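/*! Usage sketch for the overload above (illustrative only): the allocation callback
 *  receives the incoming message size and must return the buffer MPI will write into.
 *  The callback name, its parameter names and the unsigned char buffers are
 *  assumptions for the example, not part of this header.
 * \code
 *    static void * example_alloc(size_t msg_sz, size_t, size_t, size_t prc, size_t ri, size_t tag, void * ptr)
 *    {
 *        // ptr is the user argument (ptr_arg): a vector of per-message receive buffers
 *        auto & rbuf = *static_cast<openfpm::vector<openfpm::vector<unsigned char>> *>(ptr);
 *        rbuf.get(ri).resize(msg_sz);          // make room for the ri-th incoming message
 *        return rbuf.get(ri).getPointer();     // MPI receives directly into this storage
 *    }
 *
 *    // later, with prc/data/prc_recv/recv_sz already filled:
 *    // v_cl.sendrecvMultipleMessagesNBX(prc, data, prc_recv, recv_sz, example_alloc, &rbuf);
 * \endcode
 */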
// sendrecvMultipleMessagesNBXAsync(prc, data, prc_recv, recv_sz, msg_alloc, ptr_arg, opt):
// asynchronous version of the overload above
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

    std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE
              << " asynchronous communication functions" << std::endl;

    for (size_t i = 0 ; i < prc.size() ; i++)
    {send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

    for (size_t i = 0 ; i < prc_recv.size() ; i++)
    {
        void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

        recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));
    }

    {NBX_prc_cnt_base = NBX_cnt;}
// sendrecvMultipleMessagesNBX(prc, data, msg_alloc, ptr_arg, opt):
// the processors we receive from are not known in advance
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

    checkType<typename T::value_type>();

    for (size_t i = 0 ; i < prc.size() ; i++)
// sendrecvMultipleMessagesNBXAsync(prc, data, msg_alloc, ptr_arg, opt):
// asynchronous version of the overload above
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

    checkType<typename T::value_type>();

    for (size_t i = 0 ; i < prc.size() ; i++)
// sendrecvMultipleMessagesNBX(n_send, sz, prc, ptr, n_recv, prc_recv, sz_recv, msg_alloc, ptr_arg, opt):
// raw-pointer version, receiving processors and receive sizes are known
size_t prc[], void * ptr[],
size_t n_recv, size_t prc_recv[],
size_t sz_recv[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

#ifdef VCLUSTER_PERF_REPORT
    nbx_timer.start();
#endif

    for (size_t i = 0 ; i < n_send ; i++)
    {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

    for (size_t i = 0 ; i < n_recv ; i++)
    {
        void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

        recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);
    }

#ifdef VCLUSTER_PERF_REPORT
    time_spent += nbx_timer.getwct();
#endif
// sendrecvMultipleMessagesNBXAsync(n_send, sz, prc, ptr, n_recv, prc_recv, sz_recv, msg_alloc, ptr_arg, opt):
// asynchronous version of the overload above
size_t prc[], void * ptr[],
size_t n_recv, size_t prc_recv[],
size_t sz_recv[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

    std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE
              << " asynchronous communication functions" << std::endl;

    for (size_t i = 0 ; i < n_send ; i++)
    {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

    for (size_t i = 0 ; i < n_recv ; i++)
    {
        void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

        recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);
    }

    {NBX_prc_cnt_base = NBX_cnt;}
// sendrecvMultipleMessagesNBX(n_send, sz, prc, ptr, n_recv, prc_recv, msg_alloc, ptr_arg, opt):
// the receiving processors are known but the receive sizes are not, so the sizes are exchanged first
void * ptr[], size_t n_recv, size_t prc_recv[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

#ifdef VCLUSTER_PERF_REPORT
    nbx_timer.start();
#endif

    sz_recv_tmp.resize(n_recv);

    // first exchange the message sizes
    for (size_t i = 0 ; i < n_send ; i++)
    {send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

    for (size_t i = 0 ; i < n_recv ; i++)
    {recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}

    // then exchange the actual messages
    for (size_t i = 0 ; i < n_send ; i++)
    {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

    for (size_t i = 0 ; i < n_recv ; i++)
    {
        void * ptr_recv = msg_alloc(sz_recv_tmp.get(i),0,0,prc_recv[i],i,0,ptr_arg);

        recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));
    }

#ifdef VCLUSTER_PERF_REPORT
    time_spent += nbx_timer.getwct();
#endif
// sendrecvMultipleMessagesNBXAsync(n_send, sz, prc, ptr, n_recv, prc_recv, msg_alloc, ptr_arg, opt):
// asynchronous version of the overload above
void * ptr[], size_t n_recv, size_t prc_recv[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

    std::cout << __FILE__ << ":" << __LINE__ << " error: you can queue at most " << NQUEUE
              << " asynchronous communication functions" << std::endl;

    sz_recv_tmp.resize(n_recv);

    // exchange the message sizes first; the data exchange is completed by sendrecvMultipleMessagesNBXWait()
    for (size_t i = 0 ; i < n_send ; i++)
    {send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

    for (size_t i = 0 ; i < n_recv ; i++)
    {recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}

    {NBX_prc_cnt_base = NBX_cnt;}
// sendrecvMultipleMessagesNBX(n_send, sz, prc, ptr, msg_alloc, ptr_arg, opt):
// fully dynamic NBX, the set of processors we receive from is discovered by the protocol itself
size_t prc[], void * ptr[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

#ifdef VCLUSTER_PERF_REPORT
    nbx_timer.start();
#endif

    std::cout << __FILE__ << ":" << __LINE__
              << " error: there are still asynchronous calls running, you have to complete them before going back to synchronous" << std::endl;

    queue_all_sends(n_send,sz,prc,ptr);

    NBX_prc_cnt_base = NBX_cnt;

    }
    while (flag == false);

#ifdef VCLUSTER_PERF_REPORT
    time_spent += nbx_timer.getwct();
#endif
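/*! Usage sketch for the fully dynamic overload above (illustrative only): every rank
 *  sends one message to its right neighbour and the receivers are discovered by the
 *  NBX protocol. Variable names and the example_alloc callback (sketched earlier)
 *  are assumptions for the example.
 * \code
 *    Vcluster<> & v_cl = create_vcluster();
 *
 *    openfpm::vector<openfpm::vector<unsigned char>> rbuf;
 *    rbuf.resize(v_cl.size());
 *
 *    unsigned char msg[64];
 *    for (size_t k = 0 ; k < sizeof(msg) ; k++)
 *    {msg[k] = 7;}
 *
 *    size_t prc[1] = { (v_cl.rank() + 1) % v_cl.size() };
 *    size_t sz[1]  = { sizeof(msg) };
 *    void * ptr[1] = { msg };
 *
 *    v_cl.sendrecvMultipleMessagesNBX(1,sz,prc,ptr,example_alloc,&rbuf);
 * \endcode
 */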
// sendrecvMultipleMessagesNBXAsync(n_send, sz, prc, ptr, msg_alloc, ptr_arg, opt):
// asynchronous version of the fully dynamic overload above
size_t prc[], void * ptr[],
void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
void * ptr_arg,
long int opt = NONE)

    queue_all_sends(n_send,sz,prc,ptr);

    {NBX_prc_cnt_base = NBX_cnt;}
for (unsigned int j = 0 ; j < NQUEUE ; j++)
{
    if (NBX_active[j] == NBX_Type::NBX_UNACTIVE)
    {continue;}

    if (NBX_active[j] == NBX_Type::NBX_KNOWN_PRC)
    {
        // the receiving processors are known: post the queued sends and the receives
        for (size_t i = 0 ; i < NBX_prc_n_send[j] ; i++)
        {send(NBX_prc_prc[j][i],SEND_SPARSE + NBX_cnt*131072,NBX_prc_ptr[j][i],NBX_prc_sz[j][i]);}

        for (size_t i = 0 ; i < NBX_prc_n_recv[j] ; i++)
        {
            void * ptr_recv = NBX_prc_msg_alloc[j](sz_recv_tmp.get(i),0,0,NBX_prc_prc_recv[j][i],i,0,this->NBX_prc_ptr_arg[j]);

            recv(NBX_prc_prc_recv[j][i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));
        }

        NBX_active[j] = NBX_Type::NBX_KNOWN;
    }

    if (NBX_active[j] == NBX_Type::NBX_KNOWN)
    {
        NBX_active[j] = NBX_Type::NBX_UNACTIVE;
    }

    }
    while (flag == false);

    NBX_active[j] = NBX_Type::NBX_UNACTIVE;
}
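/*! Usage sketch for the asynchronous variants (illustrative only): the *Async calls
 *  only start an NBX round, and sendrecvMultipleMessagesNBXWait() must run before
 *  the received buffers are touched. Reuses sz/prc/ptr/example_alloc/rbuf from the
 *  earlier sketch, which are assumptions for the example.
 * \code
 *    v_cl.sendrecvMultipleMessagesNBXAsync(1,sz,prc,ptr,example_alloc,&rbuf);
 *
 *    // overlap independent local computation here
 *
 *    v_cl.sendrecvMultipleMessagesNBXWait();   // now rbuf is safe to read
 * \endcode
 */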
bool send(size_t proc, size_t tag, const void * mem, size_t sz)

    MPI_IsendWB::send(proc,SEND_RECV_BASE + tag,mem,sz,req.last());
bool recv(size_t proc, size_t tag, void * v, size_t sz)
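/*! Usage sketch for send()/recv() above (illustrative only): both calls only queue
 *  non-blocking requests, which are completed by execute().
 * \code
 *    int out = (int)v_cl.rank();
 *    int in  = -1;
 *
 *    size_t right = (v_cl.rank() + 1) % v_cl.size();
 *    size_t left  = (v_cl.rank() + v_cl.size() - 1) % v_cl.size();
 *
 *    v_cl.send(right,0,&out,sizeof(out));
 *    v_cl.recv(left,0,&in,sizeof(in));
 *    v_cl.execute();   // both transfers are complete only after this returns
 * \endcode
 */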
// Bcast(openfpm::vector<T,Mem,layout_base> & v, size_t root)
template<typename T, typename Mem, template<typename> class layout_base>

for (size_t i = 0 ; i < NQUEUE ; i++)
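/*! Usage sketch for the collective helpers such as Bcast (illustrative only): the
 *  vector is resized consistently on every rank, filled on the root, and valid
 *  everywhere only after execute().
 * \code
 *    openfpm::vector<float> v;
 *    v.resize(2);
 *
 *    if (v_cl.rank() == 0)
 *    { v.get(0) = 1.0f; v.get(1) = 2.0f; }
 *
 *    v_cl.Bcast(v,0);    // broadcast from rank 0
 *    v_cl.execute();
 * \endcode
 */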
MPI_Request bar_req
barrier request
void progressCommunication()
In case of asynchronous communications like sendrecvMultipleMessagesNBXAsync this function progresses the communication.
Vcluster_base(const Vcluster_base &)
disable copy constructor
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
void clear()
Eliminate all elements.
void sendrecvMultipleMessagesNBX(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
Vcluster_base(int *argc, char ***argv)
Virtual cluster constructor.
size_t getProcessUnitID()
Get the process unit id.
void sendrecvMultipleMessagesNBX(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, openfpm::vector< size_t > &prc_recv, openfpm::vector< size_t > &recv_sz, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
openfpm::vector_fr< BMemory< InternalMemory > > recv_buf[NQUEUE]
Receive buffers.
MPI_Comm getMPIComm()
Get the MPI_Communicator (or processor group) this VCluster is using.
bool allGather(T &send, openfpm::vector< T, Mem, gr > &v)
Gather the data from all processors.
openfpm::vector< size_t > sz_send[NQUEUE]
vector of the sizes of the send buffers
openfpm::vector< size_t > proc_com
MPI_Status bar_stat
barrier status
Set of wrapping classes for MPI_Iallreduce.
int numPE
number of processing units per process
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], size_t sz_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
int shmrank
rank within the node
mgpu::ofp_context_t * context
standard context for mgpu (if CUDA is detected, otherwise it is unused)
bool send(size_t proc, size_t tag, const void *mem, size_t sz)
Send data to a processor.
std::vector< int > post_exe
vector of functions to execute after all the requests have been performed
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
openfpm::vector< size_t > tags[NQUEUE]
receiving tags
double getwct()
Return the elapsed real time.
void clear()
Release the buffer used for communication.
openfpm::vector< MPI_Status > stat
vector of MPI status
This class checks whether the type T has pointers inside.
void execute()
Execute all the requests.
Implementation of a 1-D std::vector-like structure.
unsigned short us
unsigned short
This class virtualizes the cluster of PCs as a set of processes that communicate.
unsigned int ui
unsigned integer
void sendrecvMultipleMessagesNBXAsync(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
static void recv(size_t proc, size_t tag, void *buf, size_t sz, MPI_Request &req)
General recv for a general buffer.
bool NBX_prc_reached_bar_req[NQUEUE]
Has the barrier request been reached.
bool send(size_t proc, size_t tag, openfpm::vector< T, Mem, gr > &v)
Send data to a processor.
General recv for a vector of any type.
size_t rank()
Get the process unit id.
openfpm::vector< void * > ptr_send[NQUEUE]
vector of pointers to the send buffers
Vcluster_base & operator=(const Vcluster_base &)
disable operator=
void start()
Start the timer.
bool recv(size_t proc, size_t tag, void *v, size_t sz)
Recv data from a processor.
size_t getProcessingUnits()
Get the total number of processors.
void sendrecvMultipleMessagesNBX(size_t n_send, size_t sz[], size_t prc[], void *ptr[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages.
int NBX_prc_qcnt
NBX communications on queue (-1 means 0, 0 means 1, 1 means 2, ...)
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
unsigned char uc
unsigned char
void sum(T &num)
Sum the numbers across all processors and get the result.
mgpu::ofp_context_t & getmgpuContext(bool iw=true)
If NVIDIA CUDA is activated, return an mgpu context.
void sendrecvMultipleMessagesNBXAsync(openfpm::vector< size_t > &prc, openfpm::vector< T > &data, openfpm::vector< size_t > &prc_recv, openfpm::vector< size_t > &recv_sz, void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
void sendrecvMultipleMessagesNBXWait()
Send and receive multiple messages: wait for the NBX communication to complete.
bool recv(size_t proc, size_t tag, openfpm::vector< T, Mem, gr > &v)
Recv data from a processor.
void sendrecvMultipleMessagesNBXAsync(size_t n_send, size_t sz[], size_t prc[], void *ptr[], size_t n_recv, size_t prc_recv[], size_t sz_recv[], void *(*msg_alloc)(size_t, size_t, size_t, size_t, size_t, size_t, void *), void *ptr_arg, long int opt=NONE)
Send and receive multiple messages, asynchronous version.
openfpm::vector< int > map_scatter
vector that contains the scatter map (it is basically an array of ones)
size_t size()
Get the total number of processors.
bool Bcast(openfpm::vector< T, Mem, layout_base > &v, size_t root)
Broadcast the data to all processors.
openfpm::vector< MPI_Request > req
vector of MPI requests
void max(T &num)
Get the maximum number across all processors (or reduction with infinity norm)
temporary buffer for reductions
int m_size
number of processes
Class for CPU time benchmarking.
void stop()
Stop the timer.
General send for a vector of any type.
void min(T &num)
Get the minimum number across all processors (or reduction with infinity norm)