doxygen/openfpm/compute__optimal__device__grid_8hpp_source.html

/*

 * compute_optimal_device_grid.hpp

 *

 *  Created on: Oct 1, 2017

 *      Author: i-bird

 */


#ifndef OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_

#define OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_


/*! \brief Choose a thread-block / grid decomposition for a grid of size sz
 *
 * The sizes of the first three dimensions are factorized with
 * openfpm::math::getFactorization. Leading factors equal to 2 are moved into
 * the thread block (x first, then y, then z) for as long as doubling the block
 * does not exceed max_block_size. If, once the factors of 2 are used up, the
 * block is still smaller than min_block_size, the smallest factor still
 * available across the dimensions is moved into the block, one at a time,
 * until min_block_size is reached, the factors run out, or the next factor
 * would push the block beyond max_block_size.
 *
 * Every factor that is not moved into the block ends up in the grid size of
 * its dimension. Dimensions beyond the third are folded into dg.grids.z.
 *
 * \tparam dim number of dimensions of sz
 *
 * \param dg output, dg.threads and dg.grids are overwritten on every path
 *           (nothing is written when dim == 0)
 * \param sz size of the grid in each dimension
 * \param max_block_size a factor is moved into the block only if the resulting
 *                       block size does not exceed this value
 * \param min_block_size block size that the second stage tries to reach
 *
 * \note assumes getFactorization returns the prime factors in ascending order,
 *       so that the factors of 2 come first -- TODO confirm against its definition
 */
template<unsigned int dim>
void calculate_optimal_device_grid(device_grid<dim> & dg,
                                   size_t (& sz)[dim],
                                   size_t max_block_size,
                                   size_t min_block_size)
{
    if (dim == 0)   {return ;}

    // Number of threads in the block chosen so far
    // (threads.x * threads.y * threads.z)
    size_t tot_block = 1;

    dg.threads.x = 1;
    dg.threads.y = 1;
    dg.threads.z = 1;

    // Factors of the first three dimensions and, for each of them, the index
    // of the first factor that has not been moved into the block yet.
    // y and z stay empty when they are not factorized.
    std::vector<unsigned long int> x;
    std::vector<unsigned long int> y;
    std::vector<unsigned long int> z;

    size_t jx = 0;
    size_t jy = 0;
    size_t jz = 0;

    // Stage 1: prioritize the factors of 2 for the block

    // Get the factors for x
    openfpm::math::getFactorization(sz[0],x);

    while (jx < x.size() && x[jx] == 2 && tot_block * 2 <= max_block_size)
    {
        dg.threads.x *= 2;
        tot_block *= 2;

        jx++;
    }

    // true once doubling the block one more time would exceed max_block_size;
    // from that point on no further dimension is factorized
    bool block_full = tot_block * 2 > max_block_size;

    // Get the factors for y
    bool y_factorized = false;

    if (dim >= 2 && !block_full)
    {
        openfpm::math::getFactorization(sz[1],y);
        y_factorized = true;

        while (jy < y.size() && y[jy] == 2 && tot_block * 2 <= max_block_size)
        {
            dg.threads.y *= 2;
            tot_block *= 2;

            jy++;
        }

        block_full = tot_block * 2 > max_block_size;
    }

    // Get the factors for z
    bool z_factorized = false;

    if (dim >= 3 && !block_full)
    {
        openfpm::math::getFactorization(sz[2],z);
        z_factorized = true;

        while (jz < z.size() && z[jz] == 2 && tot_block * 2 <= max_block_size)
        {
            dg.threads.z *= 2;
            tot_block *= 2;

            jz++;
        }

        block_full = tot_block * 2 > max_block_size;
    }

    // Calculate the grids from the threads configuration. This is done before
    // any return so that dg.grids is always defined on exit.

    dg.grids.x = 1;
    for (size_t k = jx ; k < x.size() ; k++)
    {dg.grids.x *= x[k];}

    if (y_factorized)
    {
        dg.grids.y = 1;
        for (size_t k = jy ; k < y.size() ; k++)
        {dg.grids.y *= y[k];}
    }
    else if (dim >= 2)
    {dg.grids.y = sz[1];}   // y was never split: the grid takes the full size
    else
    {dg.grids.y = 1;}

    dg.grids.z = 1;
    for (size_t k = jz ; k < z.size() ; k++)
    {dg.grids.z *= z[k];}

    // Fold into z every dimension that has not been factorized: from the
    // third one when z was skipped, from the fourth one otherwise
    for (size_t k = (z_factorized ? 3 : 2) ; k < dim ; k++)
    // coverity[dead_error_line]
    {dg.grids.z *= sz[k];}

    // Nothing else to do if the block cannot grow or is already big enough
    if (block_full || tot_block >= min_block_size)
    {return;}

    // Stage 2: the factors of 2 were not enough to reach min_block_size,
    // move the smallest remaining factors into the block

    std::vector<unsigned long int> * ptr_xyz[3] = {&x, &y, &z};
    size_t * jj[3] = {&jx, &jy, &jz};

    // Only x, y and z carry factors, never index past the third dimension
    const size_t n_axes = (dim < 3) ? dim : 3;

    while (tot_block < min_block_size && (jx < x.size() || jy < y.size() || jz < z.size() ) )
    {
        size_t best_fact = std::numeric_limits<size_t>::max();
        size_t k_best = 0;

        // Select the dimension whose next unused factor is the smallest
        for (size_t k = 0 ; k < n_axes ; k++)
        {
            if (*jj[k] < ptr_xyz[k]->size() && ptr_xyz[k]->operator[](*jj[k]) < best_fact )
            {
                best_fact = ptr_xyz[k]->operator[](*jj[k]);
                k_best = k;
            }
        }

        // The maximum block size cannot be passed. Same test as
        // tot_block * best_fact > max_block_size, written as a division so
        // that it cannot overflow (tot_block is always >= 1)
        if (best_fact > max_block_size / tot_block)
        {break;}

        if (k_best == 0)
        {
            dg.threads.x *= best_fact;
            dg.grids.x /= best_fact;
        }
        else if (k_best == 1)
        {
            dg.threads.y *= best_fact;
            dg.grids.y /= best_fact;
        }
        /* coverity[dead_error_line] */
        else if (k_best == 2)
        {
            dg.threads.z *= best_fact;
            dg.grids.z /= best_fact;
        }

        tot_block *= best_fact;

        (*jj[k_best])++;
    }
}


#endif /* OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ */

device_grid
Definition map_grid.hpp:476

device_grid::grids
dim3_ grids
number of grids for the kernel execution
Definition map_grid.hpp:481

device_grid::threads
dim3_ threads
number of threads in each block
Definition map_grid.hpp:478

dim3_::z
unsigned int z
size in z dimension
Definition map_grid.hpp:471

dim3_::x
unsigned int x
size in x dimension
Definition map_grid.hpp:465

dim3_::y
unsigned int y
size in y dimension
Definition map_grid.hpp:468