#ifndef VCLUSTER_BASE_HPP_
#define VCLUSTER_BASE_HPP_

#include "util/cuda_util.hpp"
#include "MPI_wrapper/MPI_util.hpp"
#include "Vector/map_vector.hpp"
#include "MPI_wrapper/MPI_IallreduceW.hpp"
#include "MPI_wrapper/MPI_IrecvW.hpp"
#include "MPI_wrapper/MPI_IsendW.hpp"
#include "MPI_wrapper/MPI_IAllGather.hpp"
#include "MPI_wrapper/MPI_IBcastW.hpp"
#include "util/check_no_pointers.hpp"
#include "util/util_debug.hpp"
#include "util/Vcluster_log.hpp"
#include "memory/BHeapMemory.hpp"
#include "Packer_Unpacker/has_max_prop.hpp"
#include "data_type/aggregate.hpp"
#include "util/ofp_context.hpp"
extern double time_spent;

constexpr int MSG_LENGTH = 1024;
constexpr int MSG_SEND_RECV = 1025;
constexpr int SEND_SPARSE = 8192;
constexpr int NONE = 1;
constexpr int NEED_ALL_SIZE = 2;

constexpr int SERIVCE_MESSAGE_TAG = 16384;
constexpr int SEND_RECV_BASE = 4096;
constexpr int GATHER_BASE = 24576;

constexpr int RECEIVE_KNOWN = 4;
constexpr int KNOWN_ELEMENT_OR_BYTE = 8;
constexpr int MPI_GPU_DIRECT = 16;

constexpr int NQUEUE = 4;

extern size_t n_vcluster;
extern bool global_mpi_init;
extern bool ofp_initialized;
extern size_t tot_sent;
extern size_t tot_recv;
template<typename T>
void assign(T * ptr1, T * ptr2)
/*! \brief This class virtualizes the cluster of PCs as a set of processes that communicate
 *
 */
template<typename InternalMemory>
class Vcluster_base
    //! State of each queued NBX communication
    NBX_Type NBX_active[NQUEUE];

    //! Base NBX counter of the round in which the current queue of communications started
    int NBX_prc_cnt_base = 0;

    //! Parameters of each queued communication: number of sends, destination processors,
    //! send buffers, send sizes, number of receives, receive-buffer allocator,
    //! source processors and user argument passed to the allocator
    size_t NBX_prc_n_send[NQUEUE];
    size_t * NBX_prc_prc[NQUEUE];
    void ** NBX_prc_ptr[NQUEUE];
    size_t * NBX_prc_sz[NQUEUE];
    size_t NBX_prc_n_recv[NQUEUE];
    void * (* NBX_prc_msg_alloc[NQUEUE])(size_t,size_t,size_t,size_t,size_t,size_t,void *);
    size_t * NBX_prc_prc_recv[NQUEUE];
    void * NBX_prc_ptr_arg[NQUEUE];
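    /* Hedged note (not in the original source): from the call sites later in this file the
     * allocator stored in NBX_prc_msg_alloc is invoked as
     *
     *   msg_alloc(msg_size, 0, 0, source_processor, receive_index, tag, ptr_arg)
     *
     * i.e. the second and third size_t arguments are always passed as 0 here, and the
     * callback must return a buffer of at least msg_size bytes into which the message
     * coming from source_processor is received.
     */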
    void queue_all_sends(size_t n_send, size_t sz[], size_t prc[], void * ptr[])
    {
        {std::cerr << "Error: " << __FILE__ << ":" << __LINE__
                   << " this function must be called when no other requests are in progress."
                      " If you use functions like max(), sum(), send() or recv(), check that"
                      " you did not forget to call execute()\n";}

        for (size_t i = 0 ; i < n_send ; i++)
        {
            check_valid(ptr[i],sz[i]);

            // Messages larger than 2^31 - 1 bytes do not fit in an int count:
            // send them as (sz[i] >> 3) + 1 elements of MPI_DOUBLE instead of MPI_BYTE
            if (sz[i] > 2147483647)
            {MPI_SAFE_CALL(MPI_Issend(ptr[i], (sz[i] >> 3) + 1, MPI_DOUBLE, prc[i],
                                      SEND_SPARSE + (NBX_cnt + NBX_prc_qcnt)*131072 + i,
                                      MPI_COMM_WORLD, &req.last()));}
            else
            {MPI_SAFE_CALL(MPI_Issend(ptr[i], sz[i], MPI_BYTE, prc[i],
                                      SEND_SPARSE + (NBX_cnt + NBX_prc_qcnt)*131072 + i,
                                      MPI_COMM_WORLD, &req.last()));}
        }
    }
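    /* Hedged note on the tag arithmetic above (not in the original source): every NBX round
     * owns a window of 131072 MPI tags starting at SEND_SPARSE + round*131072, and message i
     * of that round uses tag SEND_SPARSE + round*131072 + i. For example, with
     * SEND_SPARSE = 8192, message i = 2 of round NBX_cnt + NBX_prc_qcnt = 1 is sent with tag
     * 8192 + 1*131072 + 2 = 139266. This keeps messages belonging to different (possibly
     * queued) rounds from matching each other.
     */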
    openfpm::vector_fr<BMemory<InternalMemory>> recv_buf[NQUEUE];
        int already_finalised;

        MPI_Finalized(&already_finalised);
        if (!already_finalised)
        {
            if (MPI_Finalize() != 0)
            {std::cerr << __FILE__ << ":" << __LINE__ << " MPI_Finalize FAILED \n";}
        }
        for (unsigned int i = 0 ; i < NQUEUE ; i++)
        {NBX_active[i] = NBX_Type::NBX_UNACTIVE;}

        check_new(this,8,VCLUSTER_EVENT,PRJ_VCLUSTER);

        int already_initialised;
        MPI_Initialized(&already_initialised);

        if (!already_initialised)

        // shmrank: rank within the shared-memory node
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_rank(shmcomm, &shmrank);
        MPI_Comm_free(&shmcomm);

        MPI_Comm_size(MPI_COMM_WORLD, &m_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &m_rank);
#ifdef EXTERNAL_SET_GPU

#if defined(PRINT_RANK_TO_GPU) && defined(CUDA_GPU)
        char node_name[MPI_MAX_PROCESSOR_NAME];

        MPI_Get_processor_name(node_name,&len);

        std::cout << "Rank: " << m_rank << " on host: " << node_name
                  << " works on GPU: " << context->getDevice() << "/" << context->getNDevice() << std::endl;
#endif

        // How many NBX rounds fit into the MPI tag space before the counter must wrap
        MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_v, &flag);
        tag_ub = *(int *)tag_ub_v;

        nbx_cycle = (tag_ub - SEND_SPARSE - 131072 - NQUEUE*131072) / 131072;

        {std::cerr << __FILE__ << ":" << __LINE__ << " Error: MPI_TAG_UB is too small for OpenFPM" << std::endl;}
    template<typename T>
    void checkType()
    {
        // Fundamental types (int, float, ...) need no further check
        if (std::is_fundamental<T>::value == true)

        if (std::is_pointer<T>::value == true)
        {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
                   << " is a pointer, sending pointer values makes no sense\n";}

        if (std::is_lvalue_reference<T>::value == true)
        {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
                   << " is an lvalue reference, sending references makes no sense\n";}

        if (std::is_rvalue_reference<T>::value == true)
        {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
                   << " is an rvalue reference, sending references makes no sense\n";}

        std::cerr << "Warning: " << __FILE__ << ":" << __LINE__ << " impossible to check the type " << demangle(typeid(T).name())
                  << ", please consider adding a static method \"static bool noPointers()\"\n";

        std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " the type " << demangle(typeid(T).name())
                  << " has pointers inside, sending pointer values makes no sense\n";
        if (context == NULL && iw == true)
        {
            std::cout << __FILE__ << ":" << __LINE__
                      << " Warning: it seems that the GPU context is not initialized."
                         " Either a compatible working CUDA device has not been found,"
                         " or openfpm_init has been called in a file that was not compiled with NVCC" << std::endl;
        }
        return MPI_COMM_WORLD;
        return this->m_size*numPE;
#ifdef VCLUSTER_PERF_REPORT
        std::cout << "-- REPORT COMMUNICATIONS --" << std::endl;
        std::cout << "Processor " << this->rank() << " sent: " << tot_sent << std::endl;
        std::cout << "Processor " << this->rank() << " received: " << tot_recv << std::endl;
        std::cout << "Processor " << this->rank() << " time spent: " << time_spent << std::endl;
        std::cout << "Processor " << this->rank() << " Bandwidth: S:" << (double)tot_sent / time_spent * 1e-9
                  << " GB/s R:" << (double)tot_recv / time_spent * 1e-9 << " GB/s" << std::endl;
        std::cout << "Error: to activate the performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;

#ifdef VCLUSTER_PERF_REPORT
        std::cout << "Error: to activate the performance statistics on VCluster, enable VCLUSTER_PERF_REPORT" << std::endl;
    template<typename T>
    void sum(T & num)

    template<typename T>
    void max(T & num)

    template<typename T>
    void min(T & num)
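    /* Hedged usage sketch (not part of the original header): the reductions above are
     * asynchronous and complete only after execute() is called. Assuming the usual
     * openfpm_init()/create_vcluster() entry points:
     *
     *   #include "VCluster/VCluster.hpp"
     *
     *   openfpm_init(&argc,&argv);
     *   Vcluster<> & v_cl = create_vcluster();
     *
     *   size_t n_local = 42;   // some per-processor value
     *   v_cl.sum(n_local);     // queue the reduction
     *   v_cl.execute();        // complete all queued requests
     *   // n_local now holds the sum across all processors
     *
     *   openfpm_finalize();
     */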
        MPI_SAFE_CALL(MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&stat,&stat_t));

        // Recover from the tag which queued NBX communication this message belongs to
        unsigned int i = (stat_t.MPI_TAG - SEND_SPARSE) / 131072 - NBX_prc_cnt_base;

        if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)

        bool big_data = true;

        // Big messages (> 2^31 - 1 bytes) travel as MPI_DOUBLE, everything else as MPI_BYTE
        MPI_SAFE_CALL(MPI_Get_count(&stat_t,MPI_DOUBLE,&msize_));
        if (msize_ == MPI_UNDEFINED)
        {
            MPI_SAFE_CALL(MPI_Get_count(&stat_t,MPI_BYTE,&msize_));
        }
        else
        {msize = ((size_t)msize_) << 3;}

        if (stat_t.MPI_TAG >= (int)(SEND_SPARSE + NBX_prc_cnt_base*131072) &&
            stat_t.MPI_TAG <  (int)(SEND_SPARSE + (NBX_prc_cnt_base + NBX_prc_qcnt + 1)*131072))

        // Ask the user-provided allocator for a buffer to receive the message into
        void * ptr = this->NBX_prc_msg_alloc[i](msize,0,0,stat_t.MPI_SOURCE,rid[i],stat_t.MPI_TAG,this->NBX_prc_ptr_arg[i]);

        check_valid(ptr,msize);

#ifdef VCLUSTER_GARBAGE_INJECTOR
#if defined (__NVCC__) && !defined(CUDA_ON_CPU)
        // Poison the freshly allocated buffer (device or host) to expose uninitialized reads
        cudaPointerAttributes cpa;
        auto error = cudaPointerGetAttributes(&cpa,ptr);
        if (error == cudaSuccess)
        {
            if (cpa.type == cudaMemoryTypeDevice)
            {cudaMemset(ptr,0xFF,msize);}
            else
            {memset(ptr,0xFF,msize);}
        }
#else
        memset(ptr,0xFF,msize);
#endif
#endif

        if (big_data == true)
        {MPI_SAFE_CALL(MPI_Recv(ptr,msize >> 3,MPI_DOUBLE,stat_t.MPI_SOURCE,stat_t.MPI_TAG,MPI_COMM_WORLD,&stat_t));}
        else
        {MPI_SAFE_CALL(MPI_Recv(ptr,msize,MPI_BYTE,stat_t.MPI_SOURCE,stat_t.MPI_TAG,MPI_COMM_WORLD,&stat_t));}

        check_valid(ptr,msize);

        for (unsigned int i = 0 ; i < NQUEUE ; i++)

        if (i >= NQUEUE || NBX_active[i] == NBX_Type::NBX_UNACTIVE || NBX_active[i] == NBX_Type::NBX_KNOWN || NBX_active[i] == NBX_Type::NBX_KNOWN_PRC)

        {MPI_SAFE_CALL(MPI_Testall(req.size(),&req.get(0),&flag,MPI_STATUSES_IGNORE));}
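    /* Hedged usage sketch (not part of the original header): progressCommunication() lets a
     * queued asynchronous NBX exchange advance while the caller does other work. Names of
     * the helper functions below are illustrative only.
     *
     *   v_cl.sendrecvMultipleMessagesNBXAsync(prc,data,msg_alloc,&recv_msgs);
     *
     *   while (work_left())                // hypothetical application loop
     *   {
     *       do_some_local_work();          // hypothetical
     *       v_cl.progressCommunication();  // move the queued sends/receives forward
     *   }
     *
     *   v_cl.sendrecvMultipleMessagesNBXWait();  // block until the exchange has completed
     */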
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

#ifdef VCLUSTER_PERF_REPORT

        // Queue all the sends, then post a receive for every known source/size pair
        for (size_t i = 0 ; i < prc.size() ; i++)
        {send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

        for (size_t i = 0 ; i < prc_recv.size() ; i++)
        {
            void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

            recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));
        }

#ifdef VCLUSTER_PERF_REPORT
        time_spent += nbx_timer.getwct();
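    /* Hedged usage sketch (not part of the original header): a typical message allocator and
     * call for the vector-based sendrecvMultipleMessagesNBX. All names below are illustrative.
     *
     *   // Collect every incoming message into one vector of byte buffers
     *   void * msg_alloc(size_t msg_sz, size_t, size_t, size_t prc_from, size_t ri, size_t tag, void * ptr)
     *   {
     *       openfpm::vector<openfpm::vector<unsigned char>> & rcv =
     *           *static_cast<openfpm::vector<openfpm::vector<unsigned char>> *>(ptr);
     *
     *       rcv.add();
     *       rcv.last().resize(msg_sz);
     *       return rcv.last().getPointer();   // buffer where the message from prc_from lands
     *   }
     *
     *   Vcluster<> & v_cl = create_vcluster();
     *   openfpm::vector<size_t> prc;                                // destination ranks
     *   openfpm::vector<openfpm::vector<unsigned char>> data;       // one payload per destination
     *   openfpm::vector<openfpm::vector<unsigned char>> recv_msgs;  // filled by msg_alloc
     *
     *   v_cl.sendrecvMultipleMessagesNBX(prc,data,msg_alloc,&recv_msgs);
     */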
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),

        std::cout << __FILE__ << ":" << __LINE__ << " Error: you can queue at most " << NQUEUE
                  << " asynchronous communication functions" << std::endl;

        for (size_t i = 0 ; i < prc.size() ; i++)
        {send(prc.get(i),SEND_SPARSE + NBX_cnt*131072,data.get(i).getPointer(),data.get(i).size());}

        for (size_t i = 0 ; i < prc_recv.size() ; i++)
        {
            void * ptr_recv = msg_alloc(recv_sz.get(i),0,0,prc_recv.get(i),i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

            recv(prc_recv.get(i),SEND_SPARSE + NBX_cnt*131072,ptr_recv,recv_sz.get(i));
        }

        {NBX_prc_cnt_base = NBX_cnt;}
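    /* Hedged usage sketch (not part of the original header): the *Async overloads only queue
     * the exchange; completion is obtained with sendrecvMultipleMessagesNBXWait().
     *
     *   v_cl.sendrecvMultipleMessagesNBXAsync(prc,data,prc_recv,recv_sz,msg_alloc,&recv_msgs);
     *   // ... overlap computation here, optionally calling v_cl.progressCommunication() ...
     *   v_cl.sendrecvMultipleMessagesNBXWait();
     */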
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
        // Verify that the payload type contains no pointers before sending it as raw bytes
        checkType<typename T::value_type>();

        for (size_t i = 0 ; i < prc.size() ; i++)
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
        checkType<typename T::value_type>();

        for (size_t i = 0 ; i < prc.size() ; i++)
                  size_t prc[], void * ptr[],
                  size_t n_recv, size_t prc_recv[],
                  size_t sz_recv[], void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
#ifdef VCLUSTER_PERF_REPORT

        for (size_t i = 0 ; i < n_send ; i++)
        {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

        for (size_t i = 0 ; i < n_recv ; i++)
        {
            void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

            recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);
        }

#ifdef VCLUSTER_PERF_REPORT
        time_spent += nbx_timer.getwct();
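    /* Hedged sketch (not part of the original header): the raw-array overload above performs
     * the same exchange with C arrays instead of openfpm::vector. Illustrative names only;
     * msg0/msg1 stand for two std::vector<unsigned char> payloads.
     *
     *   size_t prc[]      = {1,2};                              // destination ranks
     *   size_t sz[]       = {msg0.size(),msg1.size()};          // bytes to send
     *   void * ptr[]      = {msg0.data(),msg1.data()};          // send buffers
     *   size_t prc_recv[] = {1,2};                              // known sources
     *   size_t sz_recv[]  = {in_sz0,in_sz1};                    // known incoming sizes (bytes)
     *
     *   v_cl.sendrecvMultipleMessagesNBX(2,sz,prc,ptr,2,prc_recv,sz_recv,msg_alloc,&recv_msgs);
     */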
                  size_t prc[], void * ptr[],
                  size_t n_recv, size_t prc_recv[],
                  size_t sz_recv[], void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " Error: you can queue at most " << NQUEUE
                  << " asynchronous communication functions" << std::endl;

        for (size_t i = 0 ; i < n_send ; i++)
        {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

        for (size_t i = 0 ; i < n_recv ; i++)
        {
            void * ptr_recv = msg_alloc(sz_recv[i],0,0,prc_recv[i],i,SEND_SPARSE + NBX_cnt*131072,ptr_arg);

            recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv[i]);
        }

        {NBX_prc_cnt_base = NBX_cnt;}
                  void * ptr[], size_t n_recv, size_t prc_recv[],
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
#ifdef VCLUSTER_PERF_REPORT

        sz_recv_tmp.resize(n_recv);

        // Phase 1: exchange the message sizes (one size_t per peer)
        for (size_t i = 0 ; i < n_send ; i++)
        {send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

        for (size_t i = 0 ; i < n_recv ; i++)
        {recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}

        // Phase 2: exchange the payloads using the received sizes
        for (size_t i = 0 ; i < n_send ; i++)
        {send(prc[i],SEND_SPARSE + NBX_cnt*131072,ptr[i],sz[i]);}

        for (size_t i = 0 ; i < n_recv ; i++)
        {
            void * ptr_recv = msg_alloc(sz_recv_tmp.get(i),0,0,prc_recv[i],i,0,ptr_arg);

            recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));
        }

#ifdef VCLUSTER_PERF_REPORT
        time_spent += nbx_timer.getwct();
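    /* Hedged note (not in the original source): compared with the overload that also takes
     * sz_recv[], this version pays one extra sizeof(size_t) exchange per peer to learn the
     * incoming sizes before allocating the receive buffers through msg_alloc.
     */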
                  void * ptr[], size_t n_recv, size_t prc_recv[],
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt=NONE)
    {
        std::cout << __FILE__ << ":" << __LINE__ << " Error: you can queue at most " << NQUEUE
                  << " asynchronous communication functions" << std::endl;

        sz_recv_tmp.resize(n_recv);

        // Only the size exchange is queued here; the payload exchange is completed later
        // by sendrecvMultipleMessagesNBXWait()
        for (size_t i = 0 ; i < n_send ; i++)
        {send(prc[i],SEND_SPARSE + NBX_cnt*131072,&sz[i],sizeof(size_t));}

        for (size_t i = 0 ; i < n_recv ; i++)
        {recv(prc_recv[i],SEND_SPARSE + NBX_cnt*131072,&sz_recv_tmp.get(i),sizeof(size_t));}

        {NBX_prc_cnt_base = NBX_cnt;}
                  size_t prc[], void * ptr[],
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt = NONE)
    {
#ifdef VCLUSTER_PERF_REPORT

        std::cout << __FILE__ << ":" << __LINE__
                  << " Error: there are asynchronous calls still running, complete them before going back to synchronous calls" << std::endl;

        queue_all_sends(n_send,sz,prc,ptr);

        NBX_prc_cnt_base = NBX_cnt;

        }
        while (flag == false);

#ifdef VCLUSTER_PERF_REPORT
        time_spent += nbx_timer.getwct();
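    /* Hedged note (not part of the original header): this overload covers the fully dynamic
     * case in which each processor only knows what it sends. Receivers are discovered at
     * runtime (Issend + probe + barrier, the NBX scheme), so only prc/sz/ptr of the outgoing
     * messages are needed and msg_alloc is called once per discovered incoming message.
     * Illustrative call, reusing the allocator sketched earlier:
     *
     *   v_cl.sendrecvMultipleMessagesNBX(n_send,sz,prc,ptr,msg_alloc,&recv_msgs);
     */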
                  size_t prc[], void * ptr[],
                  void * (* msg_alloc)(size_t,size_t,size_t,size_t,size_t,size_t,void *),
                  void * ptr_arg, long int opt = NONE)
    {
        queue_all_sends(n_send,sz,prc,ptr);

        {NBX_prc_cnt_base = NBX_cnt;}
        for (unsigned int j = 0 ; j < NQUEUE ; j++)
        {
            if (NBX_active[j] == NBX_Type::NBX_UNACTIVE)

            // Known-processor case: the size exchange was queued by the Async call,
            // so the payloads can be exchanged now
            if (NBX_active[j] == NBX_Type::NBX_KNOWN_PRC)

            for (size_t i = 0 ; i < NBX_prc_n_send[j] ; i++)
            {send(NBX_prc_prc[j][i],SEND_SPARSE + NBX_cnt*131072,NBX_prc_ptr[j][i],NBX_prc_sz[j][i]);}

            for (size_t i = 0 ; i < NBX_prc_n_recv[j] ; i++)
            {
                void * ptr_recv = NBX_prc_msg_alloc[j](sz_recv_tmp.get(i),0,0,NBX_prc_prc_recv[j][i],i,0,this->NBX_prc_ptr_arg[j]);

                recv(NBX_prc_prc_recv[j][i],SEND_SPARSE + NBX_cnt*131072,ptr_recv,sz_recv_tmp.get(i));
            }

            NBX_active[j] = NBX_Type::NBX_KNOWN;

            if (NBX_active[j] == NBX_Type::NBX_KNOWN)

            NBX_active[j] = NBX_Type::NBX_UNACTIVE;

            }
            while (flag == false);

            NBX_active[j] = NBX_Type::NBX_UNACTIVE;
    bool send(size_t proc, size_t tag, const void * mem, size_t sz)
    {
        MPI_IsendWB::send(proc,SEND_RECV_BASE + tag,mem,sz,req.last());
    bool recv(size_t proc, size_t tag, void * v, size_t sz)
    template<typename T, typename Mem, template<typename> class layout_base>

        for (size_t i = 0 ; i < NQUEUE ; i++)