OpenFPM 5.2.0
Project that contains the implementation of distributed structures
main.cu
#ifdef __NVCC__

#define OPENMPI

//#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default

#include "Vector/vector_dist.hpp"

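// Kernel: for each particle p, fill the scalar, vector and tensor properties from the
// particle position, then shift the particle by 0.01 in x and y.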
template<typename vector_type>
__global__ void translate_fill_prop(vector_type vd)
{
    auto p = GET_PARTICLE(vd);

    vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1];

    vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
    vd.template getProp<1>(p)[1] = vd.getPos(p)[1];

    vd.template getProp<2>(p)[0][0] = vd.getPos(p)[0];
    vd.template getProp<2>(p)[0][1] = vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][0] = vd.getPos(p)[0] + vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][1] = vd.getPos(p)[1] - vd.getPos(p)[0];

    vd.getPos(p)[0] += 0.01f;
    vd.getPos(p)[1] += 0.01f;
}

int main(int argc, char* argv[])
{
    // OpenFPM GPU distribution

    // By default OpenFPM selects GPU 0 for process 0, GPU 1 for process 1, and so on. The multi-node
    // case works the same way: each node hosts a group of processes, and that group is distributed
    // across the GPUs available on that node.

    // If you want to override this behaviour, #define EXTERNAL_SET_GPU at the very beginning of the
    // program and call cudaSetDevice to select the GPU for that particular process before openfpm_init.
    // Note: to get the process rank, call MPI_Init and then MPI_Comm_rank; VCluster is not available
    // before openfpm_init. A code snippet in case we want to skip GPU 0:
    // MPI_Init(&argc,&argv);
    // int rank;
    // MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    // cudaSetDevice(1+rank);

    // initialize the library
    openfpm_init(&argc,&argv);

    // Here we define our domain: a 2D box with intervals from 0.0 to 1.0 in x and y
    Box<2,float> domain({0.0,0.0},{1.0,1.0});

    // Here we define the boundary conditions of our problem
    size_t bc[2]={PERIODIC,PERIODIC};

    // extended boundary around the domain, and the processor domain
    Ghost<2,float> g(0.05);

    // distributed vector of 100 particles in 2D, each carrying a scalar, a vector and a tensor property
    vector_dist_gpu<2,float,aggregate<float,float[2],float[2][2]>> vd(100,domain,bc,g);

    // the scalar is the element at position 0 in the aggregate
    const int scalar = 0;

    // the vector is the element at position 1 in the aggregate
    const int vector = 1;

    // the tensor is the element at position 2 in the aggregate
    const int tensor = 2;

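    // Iterate over all the particles assigned to this processor and give each one a random
    // position between 0.0 and 1.0; at this stage positions live in host memory.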
    auto it = vd.getDomainIterator();

    while (it.isNext())
    {
        auto key = it.get();

        // we define x, assign a random position between 0.0 and 1.0
        vd.getPos(key)[0] = (float)rand() / RAND_MAX;

        // we define y, assign a random position between 0.0 and 1.0
        vd.getPos(key)[1] = (float)rand() / RAND_MAX;

        // next particle
        ++it;
    }

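    // Redistribute the particles: map() moves every particle that does not belong to the
    // local processor to the processor that owns it.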
    vd.map();

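    // Offload the particle positions and the scalar, vector and tensor properties from the
    // host to the device.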
    vd.hostToDevicePos();
    vd.template hostToDeviceProp<scalar,vector,tensor>();

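    // Launch the kernel on the GPU: getDomainIteratorGPU() provides the launch configuration,
    // and CUDA_LAUNCH wraps the raw call shown in the commented-out line below.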
    auto ite = vd.getDomainIteratorGPU();
    // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
    CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

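    // Copy positions and properties back to the host so they can be written to a file.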
    vd.deviceToHostPos();
    vd.deviceToHostProp<0,1,2>();

    // We write on a file
    vd.write("output");

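    // Time loop: the kernel, the particle redistribution (map) and the ghost exchange all run
    // on the device (RUN_ON_DEVICE); data is copied back to the host only every 10 iterations,
    // when a frame is written.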
    for (int j = 0 ; j < 100 ; j++)
    {
        auto ite = vd.getDomainIteratorGPU();
        // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
        CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

        vd.map(RUN_ON_DEVICE);
        vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);

        if (j % 10 == 0)
        {
            // offload to host
            vd.deviceToHostPos();
            vd.template deviceToHostProp<0,1,2>();

            // write
            vd.write_frame("output_f",j);
        }
    }

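    // Check whether CUDA-aware MPI (RDMA on GPU memory) is active for this run.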
    bool active = is_mpi_rdma_cuda_active();

    std::cout << "Is MPI rdma active on CUDA " << active << std::endl;

    openfpm_finalize();
}

#else

int main(int argc, char* argv[])
{
    return 0;
}

#endif
Classes and member functions referenced in this example:

Box: This class represents an N-dimensional box. Defined in Box.hpp:60.
Ghost: Defined in Ghost.hpp:40.
vector_dist: Distributed vector.
vect_dist_key_dx get(): Get the actual key.
auto getPos(vect_dist_key_dx vec_key): Get the position of an element.
vector_dist_iterator getDomainIterator() const: Get an iterator that traverses the particles in the domain.
void map(size_t opt=NONE): Move all the particles that do not belong to the local processor to the respective processor.
void hostToDevicePos(): Move the memory from the host to the device.
void deviceToHostPos(): Move the memory from the device to host memory.
void deviceToHostProp(): Move the memory from the device to host memory.
bool write(std::string out, int opt=VTK_WRITER): Output particle position and properties.
bool write_frame(std::string out, size_t iteration, int opt=VTK_WRITER): Output particle position and properties.