SparseGridGpu.hpp
1 //
2 // Created by tommaso on 6/06/19.
3 //
4 
5 #ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
6 #define OPENFPM_PDATA_SPARSEGRIDGPU_HPP
7 
8 constexpr int BLOCK_SIZE_STENCIL = 128;
9 
10 #include "config.h"
11 #include "util/cuda_launch.hpp"
12 #include <cstdlib>
13 #include <SparseGridGpu/BlockMapGpu.hpp>
14 #include <Grid/iterators/grid_skin_iterator.hpp>
15 #include <Grid/Geometry/grid_smb.hpp>
16 #include "SparseGridGpu_ker.cuh"
17 #include "SparseGridGpu_kernels.cuh"
18 #include "Iterators/SparseGridGpu_iterator_sub.hpp"
19 #include "Grid/Geometry/grid_zmb.hpp"
20 #include "util/stat/common_statistics.hpp"
21 #include "Iterators/SparseGridGpu_iterator.hpp"
22 #include "Space/SpaceBox.hpp"
23 
24 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
25 #include "VTKWriter/VTKWriter.hpp"
26 #endif
27 
28 constexpr int NO_ITERATOR_INIT = 0;
29 
30 // todo: Move all the following utils into some proper file inside TemplateUtils
31 
32 enum tag_boundaries
33 {
34  NO_CALCULATE_EXISTING_POINTS,
35  CALCULATE_EXISTING_POINTS
36 };
37 
38 template<unsigned int dim>
39 struct default_edge
40 {
41  typedef boost::mpl::int_<2> type;
42 };
43 
44 
45 template<>
46 struct default_edge<1>
47 {
48  typedef boost::mpl::int_<256> type;
49  typedef boost::mpl::int_<256> tb;
50 };
51 
52 template<>
53 struct default_edge<2>
54 {
55  typedef boost::mpl::int_<16> type;
56  typedef boost::mpl::int_<256> tb;
57 };
58 
59 template<>
60 struct default_edge<3>
61 {
62  typedef boost::mpl::int_<8> type;
63  typedef boost::mpl::int_<512> tb;
64 };
65 
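// Illustrative note (not part of the original source): default_edge selects the default chunk
// edge and thread-block size from the spatial dimension, as the specializations above show.
// With those definitions the following compile-time checks hold:
//
//   static_assert(default_edge<2>::type::value == 16, "2D chunks default to 16x16");
//   static_assert(default_edge<3>::type::value == 8,  "3D chunks default to 8x8x8");
//   static_assert(default_edge<3>::tb::value == 512,  "3D default thread-block size");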
66 template<typename T>
68 {
69  typedef T type;
70 };
71 
72 template<typename T, unsigned int dim, unsigned int blockEdgeSize>
73 struct process_data_block
74 {
76 };
77 
78 template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>
79 struct process_data_block<T[N1],dim,blockEdgeSize>
80 {
82 };
83 
84 template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>
85 struct aggregate_transform_datablock_impl
86 {
88 };
89 
90 template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>
91 struct aggregate_convert
92 {
93 };
94 
95 template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>
96 struct aggregate_convert<dim,blockEdgeSize,aggregate<types ...>>
97 {
98  typedef typename aggregate_transform_datablock_impl<dim,blockEdgeSize,types ...>::type type;
99 };
100 
101 template<typename aggr>
102 struct aggregate_add
103 {
104 };
105 
106 template<typename ... types>
107 struct aggregate_add<aggregate<types ...>>
108 {
109  typedef aggregate<types ..., unsigned char> type;
110 };
111 
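// Illustrative note (not part of the original source): aggregate_convert feeds the property list
// of an aggregate through aggregate_transform_datablock_impl, producing the chunk-wise aggregate
// used internally, while aggregate_add appends one extra unsigned char property (the per-point
// flag). For example, the specialization above gives:
//
//   aggregate_add<aggregate<float,double[3]>>::type == aggregate<float,double[3],unsigned char>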
113 
114 template<typename enc_type>
115 class encap_data_block
116 {
117  int offset;
118  enc_type enc;
119 
120  public:
121 
122  encap_data_block(int offset,const enc_type & enc)
123  :offset(offset),enc(enc)
124  {}
125 
126  encap_data_block operator=(const encap_data_block<enc_type> & enc)
127  {
129 
130  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);
131 
132  return *this;
133  }
134 
135  template<unsigned int p>
136  auto get() -> decltype(enc.template get<p>()[offset])
137  {
138  return enc.template get<p>()[offset];
139  }
140 
141  template<unsigned int p>
142  auto get() const -> decltype(enc.template get<p>()[offset])
143  {
144  return enc.template get<p>()[offset];
145  }
146 };
147 
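// Illustrative note (not part of the original source): encap_data_block is a lightweight proxy
// pairing an encapsulated data chunk with the offset of one point inside it, so a single point
// can be accessed with the usual get<prop>() syntax. A hedged sketch of how such a proxy is
// obtained through the grid interface defined later in this file (get_o), assuming a grid whose
// property 0 is a float:
//
//   // auto p = grid.get_o(key);        // proxy to one point of a chunk
//   // float v = p.template get<0>();   // read property 0 of that point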
149 
150 enum StencilMode
151 {
152  STENCIL_MODE_INPLACE = 1,
153  STENCIL_MODE_INPLACE_NO_SHARED = 3
154 };
155 
160 template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
162 {
164  stencil_size,
166  SGridGpu::device_grid_type::dims> type;
167 };
168 
169 #include "encap_num.hpp"
170 
175 template<typename SGridGpu>
177 {
179 };
180 
185 template<unsigned int dim>
186 struct NNfull_is_padding_impl
187 {
188  __device__ inline static bool is_padding()
189  {
190  printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
191 
192  return false;
193  }
194 };
195 
200 template<>
201 struct NNfull_is_padding_impl<3>
202 {
203  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
204  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
205  {
206  bool isPadding_ = false;
207  for (int i = 0 ; i < 3 ; i++)
208  {
209  for (int j = 0 ; j < 3 ; j++)
210  {
211  for (int k = 0 ; k < 3 ; k++)
212  {
213  grid_key_dx<3,int> key;
214 
215  key.set_d(0,i-1);
216  key.set_d(1,j-1);
217  key.set_d(2,k-1);
218 
219  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
220  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
221  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
222  if (isPadding_) break;
223  }
224  }
225  }
226  return isPadding_;
227  }
228 };
229 
230 
235 template<>
236 struct NNfull_is_padding_impl<2>
237 {
238  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
239  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
240  {
241  bool isPadding_ = false;
242  for (int i = 0 ; i < 3 ; i++)
243  {
244  for (int j = 0 ; j < 3 ; j++)
245  {
246  grid_key_dx<2,int> key;
247 
248  key.set_d(0,i-1);
249  key.set_d(1,j-1);
250 
251  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
252  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
253  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
254  if (isPadding_) break;
255  }
256  }
257  return isPadding_;
258  }
259 };
260 
261 template<unsigned int dim>
262 struct NNFull
263 {
264  static const int nNN = IntPow<3, dim>::value;
265 
266  template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
267  __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
268  blockMap_type & blockMap,
269  SparseGrid_type & sparseGrid,
270  const unsigned int offset)
271  {
272  // point to the background element
273  int neighbourPos = blockMap.size();
274  if (offset < nNN && offset != nNN / 2)
275  {
276  int cnt = offset;
277  for (int i = 0 ; i < dim ; i++)
278  {
279  int dPos = cnt % 3;
280  cnt /= 3;
281  blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
282  }
283 
284  neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
285  }
286  return neighbourPos;
287  }
288 
289  template<typename indexT, unsigned int blockEdgeSize, typename coordType>
290  __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
291  {
292  // linearize the coord
293 
294  indexT neighbourNum = 0;
295 
296  indexT accu = 1;
297  for(int i = 0 ; i < dim ; i++)
298  {
299  int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
300  if (c < 0)
301  {
302  neighbourNum += 0;
303  }
304  else if (c >= blockEdgeSize)
305  {
306  neighbourNum += 2*accu;
307  }
308  else
309  {
310  neighbourNum += accu;
311  }
312  accu *= 3;
313  }
314 
315  return neighbourNum;
316  }
317 
318 
319  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
320  __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
321  {
322  return NNfull_is_padding_impl<3>::template is_padding(sparseGrid,coord,enlargedBlock);
323  }
324 
335  template<unsigned int blockEdgeSize, typename indexT2>
336  __device__ static inline bool getNNindex_offset(grid_key_dx<dim,indexT2> & coord, unsigned int & NN_index, unsigned int & offset_nn)
337  {
338  bool out = false;
339  NN_index = 0;
340  offset_nn = 0;
341 
342  int cnt = 1;
343  int cnt_off = 1;
344  for (unsigned int i = 0 ; i < dim ; i++)
345  {
346  int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
347 
348  NN_index += p*cnt;
349 
350  offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
351 
352  cnt *= 3;
353  cnt_off *= blockEdgeSize;
354 
355  out |= (p != 1);
356  }
357 
358  return out;
359  }
360 };
361 
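// Worked example (not part of the original source) of the ternary neighbour indexing used by
// NNFull above: each axis contributes a digit in {0,1,2} (below / inside / above the chunk), so a
// chunk has 3^dim neighbours and the chunk itself sits at index nNN/2. With dim = 2 and
// blockEdgeSize = 4, getNNindex_offset applied to the out-of-chunk coordinate (-1,2) gives
//
//   x: p = 0  ->  NN_index += 0*1,  offset_nn += (-1 + 4)*1 = 3
//   y: p = 1  ->  NN_index += 1*3,  offset_nn += 2*4        = 8
//
// i.e. the point lives in neighbour chunk 3 (the -x neighbour) at local offset 11, and the
// function returns true because the coordinate falls outside the current chunk.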
362 
363 
364 template<unsigned int nNN_, unsigned int nLoop_>
365 struct ct_par
366 {
367  static const unsigned int nNN = nNN_;
368  static const unsigned int nLoop = nLoop_;
369 };
370 
371 template<typename copy_type>
372 struct copy_prop_to_vector_block_impl
373 {
374  template<typename T, typename dst_type, typename src_type>
375  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
376  {
377  dst.template get<T::value>() = src.template get<T::value>()[bPos];
378  }
379 };
380 
381 template<typename copy_type,unsigned int N1>
382 struct copy_prop_to_vector_block_impl<copy_type[N1]>
383 {
384  template<typename T, typename dst_type, typename src_type>
385  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
386  {
387  for (int i = 0 ; i < N1 ; i++)
388  {
389  dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
390  }
391  }
392 };
393 
394 template<typename Tsrc,typename Tdst>
395 class copy_prop_to_vector_block
396 {
398  Tsrc src;
399 
401  Tdst dst;
402 
403  size_t pos;
404 
405  unsigned int bPos;
406 
407 public:
408 
409  copy_prop_to_vector_block(Tsrc src, Tdst dst,size_t pos, size_t bPos)
410  :src(src),dst(dst),pos(pos),bPos(bPos)
411  {}
412 
414  template<typename T>
415  inline void operator()(T& t) const
416  {
417  typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
418 
419  copy_prop_to_vector_block_impl<copy_rtype>::template copy<T>(src,dst,bPos);
420 
421  //meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>()[bPos],dst.template get<T::value>());
422  }
423 
424 };
425 
426 
427 
428 template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
429 class data_ptr_fill
430 {
431  typedef typename to_boost_vmpl<prp...>::type vprp;
432 
434  void * base_ptr;
435 
436  mutable size_t tot = 0;
437 
438  size_t i;
439 
440  size_t sz = 0;
441 
442  arr_arr_ptr<n_it,sizeof...(prp)> & arrs;
443 
444 public:
445 
446  data_ptr_fill(void * base_ptr,size_t i, arr_arr_ptr<n_it,sizeof...(prp)> & arrs, size_t sz)
447  :base_ptr(base_ptr),i(i),sz(sz),arrs(arrs)
448  {}
449 
451  template<typename T>
452  inline void operator()(T& t) const
453  {
454  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
455 
456  // Remove the reference from the type to copy
457  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
458 
459  arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
460 
461  tot += sz * sizeof(pack_type);
462  }
463 
464 };
465 
466 template<typename SparseGridType>
467 struct sparse_grid_section
468 {
469  SparseGridType * grd;
470 
471  // Source box
472  Box<SparseGridType::dims,size_t> src;
473 
474  // Destination box
475  Box<SparseGridType::dims,size_t> dst;
476 
477  sparse_grid_section(SparseGridType & grd,const Box<SparseGridType::dims,size_t> & src, const Box<SparseGridType::dims,size_t> & dst)
478  :grd(&grd),src(src),dst(dst)
479  {}
480 };
481 
482 
483 template<unsigned int dim,
484  typename AggregateT,
485  unsigned int blockEdgeSize = default_edge<dim>::type::value,
486  unsigned int threadBlockSize = default_edge<dim>::tb::value,
487  typename indexT=long int,
488  template<typename> class layout_base=memory_traits_inte,
489  typename linearizer = grid_smb<dim, blockEdgeSize, indexT>>
490 class SparseGridGpu : public BlockMapGpu<
491  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
492  threadBlockSize, indexT, layout_base>
493 {
494 public:
495 
496  static constexpr unsigned int dims = dim;
497 
498 private:
499 
500  typedef BlockMapGpu<
501  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
502  threadBlockSize, indexT, layout_base> BMG;
503 
504  const static unsigned char PADDING_BIT = 1;
505  typedef typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type AggregateBlockT;
506  linearizer gridGeometry;
507  grid_sm<dim, int> extendedBlockGeometry;
508  grid_sm<dim, int> gridSize;
509  unsigned int stencilSupportRadius;
510  unsigned int ghostLayerSize;
511  int req_index;
512  int req_index_swp;
513  int req_index_swp_r;
514 
515  AggregateT bck;
516 
518 
519  // Queue of remove sections
521 
522  // Queue of copy sections
524 
525  CudaMemory mem;
526 
527  // pointers for unpack
528  openfpm::vector<void *> index_ptrs;
529  openfpm::vector<void *> index_ptrs_swp;
530  openfpm::vector<void *> index_ptrs_swp_r;
531  openfpm::vector<void *> scan_ptrs;
532  openfpm::vector<void *> scan_ptrs_swp;
533  openfpm::vector<void *> scan_ptrs_swp_r;
534  openfpm::vector<void *> data_ptrs;
535  openfpm::vector<void *> data_ptrs_swp;
536  openfpm::vector<void *> data_ptrs_swp_r;
537  openfpm::vector<void *> offset_ptrs;
538  openfpm::vector<void *> offset_ptrs_swp;
539  openfpm::vector<void *> offset_ptrs_swp_r;
540  openfpm::vector<void *> mask_ptrs;
541  openfpm::vector<void *> mask_ptrs_swp;
542  openfpm::vector<void *> mask_ptrs_swp_r;
543 
544  // pointers for copyRemove
545  openfpm::vector<void *> offset_ptrs_cp;
546  openfpm::vector<void *> offset_ptrs_cp_swp;
547  openfpm::vector<void *> offset_ptrs_cp_swp_r;
548  openfpm::vector<void *> scan_ptrs_cp;
549  openfpm::vector<void *> scan_ptrs_cp_swp;
550  openfpm::vector<void *> scan_ptrs_cp_swp_r;
551  openfpm::vector<void *> data_base_ptr_cp;
552  openfpm::vector<void *> data_base_ptr_cp_swp;
553  openfpm::vector<void *> data_base_ptr_cp_swp_r;
554  openfpm::vector<int> n_cnk_cp;
555  openfpm::vector<int> n_cnk_cp_swp;
556  openfpm::vector<int> n_cnk_cp_swp_r;
557  openfpm::vector<int> n_pnt_cp;
558  openfpm::vector<int> n_pnt_cp_swp;
559  openfpm::vector<int> n_pnt_cp_swp_r;
560  openfpm::vector<int> n_shifts_cp;
561  openfpm::vector<int> n_shift_cp_swp;
562  openfpm::vector<int> n_shift_cp_swp_r;
563  typedef typename aggregate_convert<dim,blockEdgeSize,aggregate<int>>::type convertAggr;
564 
565  // Map to convert blocks from misaligned chunks
566  openfpm::vector_gpu<convertAggr> convert_blk;
567  openfpm::vector_gpu<convertAggr> convert_blk_swp;
568  openfpm::vector_gpu<convertAggr> convert_blk_swp_r;
569  openfpm::vector<Box<dim,size_t>> box_cp;
570  openfpm::vector<Box<dim,size_t>> box_cp_swp;
571  openfpm::vector<Box<dim,size_t>> box_cp_swp_r;
572 
578 
583 
590 
592 
597 
600 
603 
606 
608  mutable openfpm::vector_gpu<aggregate<int>> new_map;
609  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp;
610  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp_r;
611 
613  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs;
614  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp;
615  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp_r;
616 
620  mutable int index_size_swp = -1;
621  mutable int index_size_swp_r = -1;
622 
625 
628 
631 
634 
637 
640 
643 
644  bool findNN = false;
645 
646  inline void swap_internal_remote()
647  {
648  n_cnk_cp_swp_r.swap(n_cnk_cp);
649  n_pnt_cp_swp_r.swap(n_pnt_cp);
650  n_shift_cp_swp_r.swap(n_shifts_cp);
651  convert_blk_swp_r.swap(convert_blk);
652  box_cp_swp_r.swap(box_cp);
653  new_map_swp_r.swap(new_map);
654  }
655 
656  inline void swap_internal_local()
657  {
658  offset_ptrs_cp_swp.swap(offset_ptrs_cp);
659  scan_ptrs_cp_swp.swap(scan_ptrs_cp);
660  data_base_ptr_cp_swp.swap(data_base_ptr_cp);
661  n_cnk_cp_swp.swap(n_cnk_cp);
662  n_pnt_cp_swp.swap(n_pnt_cp);
663  n_shift_cp_swp.swap(n_shifts_cp);
664  convert_blk_swp.swap(convert_blk);
665  box_cp_swp.swap(box_cp);
666  new_map_swp.swap(new_map);
667  }
668 
669  inline void swap_local_pack()
670  {
671  index_ptrs_swp.swap(index_ptrs);
672  scan_ptrs_swp.swap(scan_ptrs);
673  data_ptrs_swp.swap(data_ptrs);
674  offset_ptrs_swp.swap(offset_ptrs);
675  mask_ptrs_swp.swap(mask_ptrs);
676 
677  e_points_swp.swap(e_points);
678  pack_output_swp.swap(pack_output);
679  tmp_swp.swap(tmp);
680 
681  pack_subs_swp.swap(pack_subs);
682  index_size_swp = private_get_index_array().size();
683  }
684 
685  inline void swap_remote_pack()
686  {
687  index_ptrs_swp_r.swap(index_ptrs);
688  scan_ptrs_swp_r.swap(scan_ptrs);
689  data_ptrs_swp_r.swap(data_ptrs);
690  offset_ptrs_swp_r.swap(offset_ptrs);
691  mask_ptrs_swp_r.swap(mask_ptrs);
692 
693  e_points_swp_r.swap(e_points);
694  pack_output_swp_r.swap(pack_output);
695  tmp_swp_r.swap(tmp);
696 
697  pack_subs_swp_r.swap(pack_subs);
698  //req_index_swp_r = req_index;
699  index_size_swp_r = private_get_index_array().size();
700  }
701 
702 protected:
703  static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
704  typedef AggregateBlockT AggregateInternalT;
705 
706 public:
707 
709  typedef int yes_i_am_grid;
710 
711  static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
712 
713  typedef linearizer grid_info;
714 
715  typedef linearizer linearizer_type;
716 
717  template<typename Tfunc> using layout_mfunc = memory_traits_inte<Tfunc>;
718 
720 
721  typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
722 
728  inline size_t size() const
729  {
730  return this->countExistingElements();
731  }
732 
738  template <typename stencil = no_stencil>
740  {
742  }
743 
750  {
751  return SparseGridGpu_iterator<dim,self>(std::declval<self>());
752  }
753 
754  template<typename dim3T>
755  inline static int dim3SizeToInt(dim3T d)
756  {
757  return d.x * d.y * d.z;
758  }
759 
760  inline static int dim3SizeToInt(size_t d)
761  {
762  return d;
763  }
764 
765  inline static int dim3SizeToInt(unsigned int d)
766  {
767  return d;
768  }
769 
770  template<typename ... v_reduce>
771  void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
772  {
773  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
774  ::template flush<v_reduce ...>(context, opt);
775 
776  findNN = false;
777  }
778 
779 
780 
781  void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
782  {
783  if (is_unpack_remote == true)
784  {swap_internal_remote();}
785 
786  if (is_unpack_remote == false)
787  {swap_internal_local();}
788  }
789 
790  void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
791  {
792  if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
793  {swap_internal_remote();}
794 
795  if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
796  {swap_internal_local();}
797  }
798 
799 
800  void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
801  {
802  if (is_pack_remote == false)
803  {
804  swap_local_pack();
805  req_index_swp = req_index;
806  }
807 
808  if (is_pack_remote == true)
809  {
810  swap_remote_pack();
811  req_index_swp_r = req_index;
812  }
813  }
814 
815  void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
816  {
817  if (opt & KEEP_GEOMETRY && is_pack_remote == false)
818  {
819  swap_local_pack();
820  req_index = req_index_swp;
821  }
822 
823  if (opt & KEEP_GEOMETRY && is_pack_remote == true)
824  {
825  swap_remote_pack();
826  req_index = req_index_swp_r;
827  }
828  }
829 
830  template<unsigned int n_it>
831  void calculatePackingPointsFromBoxes(int opt,size_t tot_pnt)
832  {
833  if (!(opt & KEEP_GEOMETRY))
834  {
835  auto & indexBuffer = private_get_index_array();
836  auto & dataBuffer = private_get_data_array();
837 
838  e_points.resize(tot_pnt);
839  pack_output.resize(tot_pnt);
840 
841  ite_gpu<1> ite;
842 
843  ite.wthr.x = indexBuffer.size();
844  ite.wthr.y = 1;
845  ite.wthr.z = 1;
846  ite.thr.x = getBlockSize();
847  ite.thr.y = 1;
848  ite.thr.z = 1;
849 
850  // Launch a kernel that counts the number of elements in each chunk
851  CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
852  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
853  n_it,
854  indexT>),
855  ite,
856  indexBuffer.toKernel(),
857  pack_subs.toKernel(),
858  gridGeometry,
859  dataBuffer.toKernel(),
860  pack_output.toKernel(),
861  tmp.toKernel(),
862  scan_it.toKernel(),
863  e_points.toKernel());
864  }
865  }
866 
867 private:
868 
869  void computeSizeOfGhostLayer()
870  {
871  unsigned int term1 = 1;
872  for (int i = 0; i < dim; ++i)
873  {
874  term1 *= blockEdgeSize + 2 * stencilSupportRadius;
875  }
876  unsigned int term2 = 1;
877  for (int i = 0; i < dim; ++i)
878  {
879  term2 *= blockEdgeSize;
880  }
881  ghostLayerSize = term1 - term2;
882  }
883 
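// Worked example (not part of the original source): computeSizeOfGhostLayer above returns
// (blockEdgeSize + 2*stencilSupportRadius)^dim - blockEdgeSize^dim, i.e. the number of ghost
// points wrapped around one chunk. For the 3D defaults (blockEdgeSize = 8, radius = 1):
//
//   ghostLayerSize = 10*10*10 - 8*8*8 = 1000 - 512 = 488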
884  void allocateGhostLayerMapping()
885  {
886  ghostLayerToThreadsMapping.resize(ghostLayerSize);
887  }
888 
889  template<typename stencil_type>
890  void computeGhostLayerMapping()
891  {
892  size_t dimensions[dim],
893  origin[dim],
894  innerDomainBegin[dim], innerDomainEnd[dim],
895  outerBoxBegin[dim], outerBoxEnd[dim],
896  bc[dim];
897  for (int i = 0; i < dim; ++i)
898  {
899  dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
900  origin[i] = 0;
901  innerDomainBegin[i] = stencilSupportRadius - 1;
902  innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
903  outerBoxBegin[i] = origin[i];
904  outerBoxEnd[i] = dimensions[i];
905  bc[i] = NON_PERIODIC;
906  }
907  grid_sm<dim, void> enlargedGrid;
908  enlargedGrid.setDimensions(dimensions);
909  Box<dim, size_t> outerBox(outerBoxBegin, outerBoxEnd);
910  Box<dim, size_t> innerBox(innerDomainBegin, innerDomainEnd);
911 
912  grid_skin_iterator_bc<dim> gsi(enlargedGrid, innerBox, outerBox, bc);
913 
914  unsigned int i = 0;
915  while (gsi.isNext())
916  {
917  auto coord = gsi.get();
918  assert(i < ghostLayerSize);
919  mem_id linId = enlargedGrid.LinId(coord);
920  // Mapping
921  ghostLayerToThreadsMapping.template get<gt>(i) = linId;
922  // Now compute the neighbour position to use
923  ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
924  //
925  ++i;
926  ++gsi;
927  }
928  assert(i == ghostLayerSize);
929 
930  ghostLayerToThreadsMapping.template hostToDevice<gt,nt>();
931  }
932 
933  void initialize(const size_t (& res)[dim])
934  {
935  gridGeometry = linearizer(res);
936 
937  computeSizeOfGhostLayer();
938  allocateGhostLayerMapping();
939  computeGhostLayerMapping<NNStar<dim>>();
940 
941  size_t extBlockDims[dim];
942  for (int d=0; d<dim; ++d)
943  {
944  extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
945  }
946  extendedBlockGeometry.setDimensions(extBlockDims);
947 
948  gridSize.setDimensions(res);
949  }
950 
951 
952  template <typename stencil, typename... Args>
953  void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
954  {
955  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
956  auto & indexBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
957  auto & dataBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
958 
959  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
960  unsigned int numScalars = indexBuffer_.size() * dataChunkSize;
961 
962  if (numScalars == 0) return;
963 
964  // NOTE: Here we want to work only on one data chunk per block!
965  constexpr unsigned int chunksPerBlock = 1;
966  const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
967  const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
968  ? numScalars / localThreadBlockSize
969  : 1 + numScalars / localThreadBlockSize;
970 
971  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
972 
973 #ifdef CUDIFY_USE_CUDA
974 
975 
976  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
977  <dim,
978  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
979  stencil>),
980  threadGridSize, localThreadBlockSize,
981  box,
982  indexBuffer_.toKernel(),
983  dataBuffer_.toKernel(),
984  this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
985  args...);
986 
987 #else
988 
989  auto bx = box;
990  auto indexBuffer = indexBuffer_.toKernel();
991  auto dataBuffer = dataBuffer_.toKernel();
992  auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
993 
995 
996  auto lamb = [=] __device__ () mutable
997  {
998  constexpr unsigned int pIndex = 0;
999 
1000  typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
1001  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1002 
1003  typedef typename decltype(dataBuffer)::value_type AggregateT_;
1004  typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
1005  typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
1006  constexpr unsigned int blockSize = MaskBlockT::size;
1007 
1008  // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
1009  // since we will also be loading neighbouring elements!) (beware the curse of dimensionality...)
1010  const unsigned int dataBlockPos = blockIdx.x;
1011  const unsigned int offset = threadIdx.x;
1012 
1013  if (dataBlockPos >= indexBuffer.size())
1014  {
1015  return;
1016  }
1017 
1018  auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1019 
1020  // todo: Add management of RED-BLACK stencil application! :)
1021  const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1022  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1023 
1024  unsigned char curMask;
1025 
1026  if (offset < blockSize)
1027  {
1028  // Read local mask to register
1029  curMask = dataBlockLoad.template get<pMask>()[offset];
1030  for (int i = 0 ; i < dim ; i++)
1031  {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1032  }
1033 
1034  openfpm::sparse_index<unsigned int> sdataBlockPos;
1035  sdataBlockPos.id = dataBlockPos;
1036 
1037  stencil::stencil(
1038  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1039  curMask, args...);
1040  };
1041 
1042  CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
1043 
1044 #endif
1045 
1046  }
1047 
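// Worked example (not part of the original source): applyStencilInPlace above launches one CUDA
// block per data chunk, with one thread per scalar of the chunk. With the 3D defaults
// (blockEdgeSize = 8, so dataChunkSize = 512) and, say, 10 stored chunks:
//
//   numScalars           = 10 * 512 = 5120
//   localThreadBlockSize = 512               // chunksPerBlock == 1
//   threadGridSize       = 5120 / 512 = 10   // rounded up when not evenly divisible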
1048  template <typename stencil, typename... Args>
1049  void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
1050  {
1051  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
1052  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1053  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1054 
1055  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
1056  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
1057 
1058  if (numScalars == 0) return;
1059 
1060  auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);
1061 
1062  CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
1063  <dim,
1064  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1065  stencil>),
1066  ite,
1067  box,
1068  indexBuffer.toKernel(),
1069  dataBuffer.toKernel(),
1070  this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
1071  args...);
1072  }
1073 
1074  template<typename ids_type>
1075  void fill_chunks_boxes(openfpm::vector<SpaceBox<dim,double>> & chunks_box, ids_type & chunk_ids, Point<dim,double> & spacing, Point<dim,double> & offset)
1076  {
1077  for (int i = 0 ; i < chunk_ids.size() ; i++)
1078  {
1079  SpaceBox<dim,double> box;
1080 
1081  auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);
1082 
1083  for (int j = 0 ; j < dim ; j++)
1084  {
1085  box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1086  box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1087  }
1088 
1089  chunks_box.add(box);
1090  }
1091  }
1092 
1093  template<typename MemType, unsigned int ... prp>
1094  void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, mgpu::ofp_context_t & ctx, int opt)
1095  {
1096  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1097  {
1098  // Convert the packed chunk ids
1099 
1100  prAlloc_prp->reset();
1101  Unpack_stat ups;
1102 
1103  for (size_t i = 0 ; i < copySect.size() ; i++)
1104  {
1105  auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
1106 
1107  copySect.get(i).grd->template addAndConvertPackedChunkToTmp<prp ...>(*prAlloc_prp,sub_it,ups,ctx);
1108  }
1109  }
1110  }
1111 
1112 
1113  template<unsigned int ... prp>
1114  void removeCopyToFinalize_phase1(mgpu::ofp_context_t & ctx, int opt)
1115  {
1116  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1117  {removePoints(ctx);}
1118  }
1119 
1120  template<unsigned int ... prp>
1121  void removeCopyToFinalize_phase2(mgpu::ofp_context_t & ctx, int opt)
1122  {
1123  // Pack information
1124  Pack_stat sts;
1125 
1126  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1127  {
1128  this->packReset();
1129 
1130  size_t req = 0;
1131  // First we count the points to copy (as source)
1132 
1133  for (size_t i = 0 ; i < copySect.size() ; i++)
1134  {
1135  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1136 
1137  this->packRequest(sub_it,req);
1138  }
1139 
1140  this->template packCalculate<prp...>(req,ctx);
1141 
1142  mem.resize(req);
1143 
1144  // Create an object of preallocated memory for properties
1145  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1146  prAlloc_prp->incRef();
1147 
1148  for (size_t i = 0 ; i < copySect.size() ; i++)
1149  {
1150  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1151 
1152  this->pack<prp ...>(*prAlloc_prp,sub_it,sts);
1153  }
1154  }
1155  else
1156  {
1157  size_t req = mem.size();
1158 
1159  // Create an object of preallocated memory for properties
1160  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1161  prAlloc_prp->incRef();
1162  }
1163 
1164  this->template packFinalize<prp ...>(*prAlloc_prp,sts,opt,false);
1165 
1166  preUnpack<CudaMemory,prp ...>(prAlloc_prp,ctx,opt);
1167 
1168  prAlloc_prp->decRef();
1169  delete prAlloc_prp;
1170  }
1171 
1172  template<unsigned int ... prp>
1173  void removeCopyToFinalize_phase3(mgpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
1174  {
1175  ite_gpu<1> ite;
1176 
1177  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1178  {
1179  if (tmp2.size() == 0)
1180  {return;}
1181 
1182  // Fill the add buffer given tmp and then flush
1183 
1184  setGPUInsertBuffer(tmp2.size(),1ul);
1185 
1186  auto & add_buff = this->blockMap.private_get_vct_add_index();
1187  add_buff.swap(tmp2);
1188 
1189  auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
1190  ite = nadd_buff.getGPUIterator();
1191  CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
1192 
1193  int sz_b = this->private_get_index_array().size();
1194 
1195  this->template flush<sLeft_<prp>...>(ctx,flush_type::FLUSH_ON_DEVICE);
1196 
1197  // get the map of the new inserted elements
1198 
1199  auto & m_map = this->getMergeIndexMapVector();
1200  auto & a_map = this->getMappingVector();
1201  auto & o_map = this->getSegmentToOutMap();
1202  auto & segments_data = this->getSegmentToMergeIndexMap();
1203 
1204  new_map.resize(a_map.size(),0);
1205 
1206  // construct new to old map
1207 
1208  ite = segments_data.getGPUIterator();
1209 
1210  if (ite.nblocks() != 0)
1211  CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);
1212 
1213  convert_blk.template hostToDevice<0>();
1214  }
1215  else
1216  {
1217  ite.wthr.x = 1;
1218  ite.wthr.y = 1;
1219  ite.wthr.z = 1;
1220 
1221  ite.thr.x = 1;
1222  ite.thr.y = 1;
1223  ite.thr.z = 1;
1224  }
1225 
1226  // Restore
1227  RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
1228 
1229  // for each packed chunk
1230 
1231  size_t n_accu_cnk = 0;
1232  for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
1233  {
1234  arr_arr_ptr<1,sizeof...(prp)> data;
1235  size_t n_pnt = n_pnt_cp.get(i);
1236 
1237  void * data_base_ptr = data_base_ptr_cp.get(i);
1238  data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
1239  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1240 
1241  ite.wthr.x = n_cnk_cp.get(i);
1242 
1243  // calculate best number of threads
1244  Box<dim,size_t> ub = box_cp.get(i);
1245 
1246  ite.thr.x = 1;
1247  for (int j = 0 ; j < dim ; j++)
1248  {
1249  size_t l = ub.getHigh(j) - ub.getLow(j) + 1;
1250 
1251  if (l >= blockEdgeSize)
1252  {ite.thr.x *= blockEdgeSize;}
1253  else
1254  {ite.thr.x *= l;}
1255  }
1256 
1257  // copy to new (1 block for each packed chunk)
1258  if (ite.nblocks() != 0 && ite.thr.x != 0)
1259  {
1260  auto & chunks = private_get_data_array();
1261 
1262  CUDA_LAUNCH((SparseGridGpuKernels::copy_packed_data_to_chunks<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1263  AggregateT,decltype(convert_blk.toKernel()),decltype(new_map.toKernel()),
1264  decltype(data),decltype(chunks.toKernel()),prp... >),ite,
1265  (unsigned int *)scan_ptrs_cp.get(i),
1266  (unsigned short int *)offset_ptrs_cp.get(i),
1267  convert_blk.toKernel(),
1268  new_map.toKernel(),
1269  data,
1270  chunks.toKernel(),
1271  n_cnk_cp.get(i),
1272  n_shifts_cp.get(i),
1273  n_pnt_cp.get(i),
1274  i,
1275  n_accu_cnk);
1276  }
1277 
1278  n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
1279  }
1280 
1281  // Save
1282  saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
1283  }
1284 
1285  template<unsigned int n_it, unsigned int ... prp>
1286  void pack_sg_implement(ExtPreAlloc<CudaMemory> & mem,
1287  Pack_stat & sts,
1288  int opt,
1289  bool is_pack_remote)
1290  {
1291  arr_ptr<n_it> index_ptr;
1292  arr_arr_ptr<n_it,sizeof...(prp)> data_ptr;
1293  arr_ptr<n_it> scan_ptr;
1294  arr_ptr<n_it> offset_ptr;
1295  arr_ptr<n_it> mask_ptr;
1297 
1298  auto & indexBuffer = private_get_index_array();
1299  auto & dataBuffer = private_get_data_array();
1300 
1301  if (req_index != pack_subs.size())
1302  {std::cerr << __FILE__ << ":" << __LINE__ << " error: the packing request number differs from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
1303 
1304  size_t tot_pnt = 0;
1305  size_t tot_cnk = 0;
1306 
1307  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1308  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1309 
1310  // Calculate total points
1311 
1312  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1313  {
1314  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
1315  sar.sa[i] = n_pnt;
1316  tot_pnt += n_pnt;
1317  }
1318 
1319  // CUDA requires aligned access; here we assume 8-byte alignment and ensure 8-byte alignment after
1320  // the cycle
1321  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1322  {
1323  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
1324 
1325  // fill index_ptr data_ptr scan_ptr
1326  index_ptr.ptr[i] = index_ptrs.get(i);
1327  scan_ptr.ptr[i] = scan_ptrs.get(i);
1328 
1329  // for all properties fill the data pointer
1330 
1331  data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
1332  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1333 
1334  offset_ptr.ptr[i] = offset_ptrs.get(i);
1335  mask_ptr.ptr[i] = mask_ptrs.get(i);
1336 
1337  tot_cnk += n_cnk;
1338  }
1339 
1340  ite_gpu<1> ite;
1341 
1342  if (tot_pnt != 0)
1343  {
1344  calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
1345 
1346  ite = e_points.getGPUIterator();
1347 
1348  // Here we copy the array of pointers to the properties into a CudaMemory array
1349 
1350  CudaMemory mem;
1351  mem.allocate(sizeof(data_ptr));
1352 
1353  // copy
1354  arr_arr_ptr<n_it,sizeof...(prp)> * arr_data = (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getPointer();
1355 
1356  for(int i = 0 ; i < n_it ; i++)
1357  {
1358  for (int j = 0 ; j < sizeof...(prp) ; j++)
1359  {
1360  arr_data->ptr[i][j] = data_ptr.ptr[i][j];
1361  }
1362  }
1363 
1364  mem.hostToDevice();
1365 
1366  CUDA_LAUNCH((SparseGridGpuKernels::pack_data<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1367  AggregateT,
1368  n_it,
1369  sizeof...(prp),
1370  indexT,
1371  decltype(e_points.toKernel()),
1372  decltype(pack_output.toKernel()),
1373  decltype(indexBuffer.toKernel()),
1374  decltype(dataBuffer.toKernel()),
1375  decltype(tmp.toKernel()),
1376  self::blockSize,
1377  prp ...>),
1378  ite,
1379  e_points.toKernel(),
1380  dataBuffer.toKernel(),
1381  indexBuffer.toKernel(),
1382  tmp.toKernel(),
1383  pack_output.toKernel(),
1384  index_ptr,
1385  scan_ptr,
1386  (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getDevicePointer(),
1387  offset_ptr,
1388  mask_ptr,
1389  sar);
1390  }
1391 
1392  ite.wthr.x = 1;
1393  ite.wthr.y = 1;
1394  ite.wthr.z = 1;
1395  ite.thr.x = pack_subs.size();
1396  ite.thr.y = 1;
1397  ite.thr.z = 1;
1398 
1399  if (pack_subs.size() != 0)
1400  {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),indexBuffer.size()+1,pack_subs.size());}
1401  }
1402 
1403 
1413  template<unsigned int ... prp, typename S2>
1414  void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem,
1415  SparseGridGpu_iterator_sub<dim,self> & sub_it,
1416  Unpack_stat & ps,
1417  mgpu::ofp_context_t &context)
1418  {
1419  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1420  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1421 
1422  // First get the number of chunks
1423 
1424  size_t n_cnk;
1425 
1426  // Unpack the number of chunks
1427  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
1428  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
1429 
1430  grid_key_dx<dim,int> origPack_pnt;
1431  grid_key_dx<dim,int> origPack_cnk;
1432  size_t sz[dim];
1433 
1434  // Unpack origin of the chunk indexing
1435  for (int i = 0 ; i < dim ; i++)
1436  {
1437  int tmp;
1438  Unpacker<int,S2>::unpack(mem,tmp,ps);
1439  origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);
1440  origPack_pnt.set_d(i,tmp);
1441  }
1442 
1443  for (int i = 0 ; i < dim ; i++)
1444  {
1445  int tmp;
1446  Unpacker<int,S2>::unpack(mem,tmp,ps);
1447  sz[i] = tmp;
1448  }
1449 
1450  size_t actual_offset = n_cnk*sizeof(indexT);
1451  // get the id pointers
1452  indexT * ids = (indexT *)((unsigned char *)mem.getDevicePointer() + ps.getOffset());
1453  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
1454 
1455  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
1456  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
1457 
1458 
1459 
1460  // Unpack number of points
1461  // calculate the number of total points
1462  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
1463  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
1464 
1465  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
1466 
1467  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
1468 
1469  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
1470 
1471  offset_ptrs_cp.add(offsets);
1472  scan_ptrs_cp.add(scan);
1473  n_cnk_cp.add(n_cnk);
1474  n_pnt_cp.add(n_pnt);
1475  data_base_ptr_cp.add(data_base_ptr);
1476 
1477  Box<dim,size_t> bx;
1478 
1479  for (int i = 0 ; i < dim ; i++)
1480  {
1481  bx.setLow(i,sub_it.getStart().get(i));
1482  bx.setHigh(i,sub_it.getStop().get(i));
1483  }
1484 
1485  box_cp.add(bx);
1486 
1487  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
1488 
1489  if (n_cnk != 0)
1490  {
1491  shifts.clear();
1492 
1493  int n_shift = 1;
1494  shifts.add();
1495 
1496  for (int i = 0 ; i < dim ; i++)
1497  {shifts.last().template get<0>()[i] = 0;}
1498 
1499  for (int i = 0 ; i < dim ; i++)
1500  {
1501  int op_q = origPack_pnt.get(i) % blockEdgeSize;
1502  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1503  int quot = abs(ou_q - op_q) % blockEdgeSize;
1504  int squot = openfpm::math::sgn(ou_q - op_q);
1505  if (quot != 0)
1506  {
1507  n_shift *= 2;
1508 
1509  int sz = shifts.size();
1510  for (int j = 0 ; j < sz ; j++)
1511  {
1512  shifts.add();
1513  for (int k = 0 ; k < dim ; k++)
1514  {
1515  shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);
1516  }
1517  }
1518  }
1519  }
1520 
1521  shifts.template hostToDevice<0>();
1522 
1523  linearizer gridGeoPack(sz);
1524 
1525  int bs = 0;
1526  size_t sz[1] = {n_cnk};
1527  grid_sm<1,void> g(sz);
1528  auto ite = g.getGPUIterator();
1529 
1530  grid_key_dx<dim,int> sz_g;
1531  grid_key_dx<dim,int> origUnpack_cnk;
1532 
1533  for (int i = 0 ; i < dim ; i++)
1534  {
1535  sz_g.set_d(i,gridGeometry.getSize()[i]);
1536  origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);
1537  }
1538 
1539  bs = tmp2.size();
1540  tmp2.resize(tmp2.size() + n_cnk * shifts.size());
1541 
1542  n_shifts_cp.add(shifts.size());
1543 
1544  switch (shifts.size())
1545  {
1546  case 1:
1547  // Calculate for each chunk the indexes where they should go + active points
1548  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
1549  n_cnk,
1550  gridGeoPack,origPack_cnk,
1551  gridGeometry,origUnpack_cnk,
1552  tmp2.toKernel(),
1553  shifts.toKernel(),
1554  sz_g,
1555  bs);
1556  break;
1557  case 2:
1558  // Calculate for each chunk the indexes where they should go + active points
1559  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
1560  n_cnk,
1561  gridGeoPack,origPack_cnk,
1562  gridGeometry,origUnpack_cnk,
1563  tmp2.toKernel(),
1564  shifts.toKernel(),
1565  sz_g,
1566  bs);
1567  break;
1568  case 4:
1569  // Calculate for each chunk the indexes where they should go + active points
1570  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
1571  n_cnk,
1572  gridGeoPack,origPack_cnk,
1573  gridGeometry,origUnpack_cnk,
1574  tmp2.toKernel(),
1575  shifts.toKernel(),
1576  sz_g,
1577  bs);
1578  break;
1579  case 8:
1580  // Calculate for each chunk the indexes where they should go + active points
1581  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
1582  n_cnk,
1583  gridGeoPack,origPack_cnk,
1584  gridGeometry,origUnpack_cnk,
1585  tmp2.toKernel(),
1586  shifts.toKernel(),
1587  sz_g,
1588  bs);
1589  break;
1590  }
1591 
1592  convertChunkIds(offsets,origPack_pnt,sub_it);
1593  }
1594  else
1595  {
1596  convert_blk.add();
1597  n_shifts_cp.add(0);
1598  }
1599 
1600  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
1601 
1602  ps.addOffset(actual_offset);
1603  }
1604 
1609  template<typename origPackType, typename IteratorType>
1610  void convertChunkIds(short int * offset, origPackType & origPack, IteratorType & sub_it)
1611  {
1612  int quot_diff[dim];
1613  for (int i = 0 ; i < dim ; i++)
1614  {
1615  int op_q = origPack.get(i) % blockEdgeSize;
1616  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1617  int quot = abs(ou_q - op_q) % blockEdgeSize;
1618  quot_diff[i] = openfpm::math::sgn(ou_q - op_q)*quot;
1619  }
1620 
1621  convert_blk.add();
1622 
1623  // Create conversion block
1624 
1625  for (int j = 0 ; j < this->blockSize ; j++)
1626  {
1627  int offset = 0;
1628  int bpos = 0;
1629  int bp_c = 1;
1630  int pos = 0;
1631  int pos_c = 1;
1632 
1633  int x = j;
1634  for (int i = 0 ; i < dim ; i++)
1635  {
1636  int c = x % blockEdgeSize;
1637 
1638  if (quot_diff[i] + c < 0)
1639  {
1640  offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
1641  bpos += bp_c*1;
1642  }
1643  else if (quot_diff[i] + c >= blockEdgeSize)
1644  {
1645  offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
1646  bpos += bp_c*1;
1647  }
1648  else
1649  {
1650  offset += pos_c*(quot_diff[i] + c);
1651  }
1652 
1653  pos += pos_c*c;
1654  pos_c *= blockEdgeSize;
1655  bp_c *= (quot_diff[i] != 0)?2:1;
1656  x /= blockEdgeSize;
1657  }
1658 
1659  convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
1660  }
1661  }
1662 
1663 public:
1664 
1665  typedef AggregateT value_type;
1666 
1667  typedef self device_grid_type;
1668 
1669  SparseGridGpu()
1670  :stencilSupportRadius(1)
1671  {};
1672 
1678  void resize(size_t (& res)[dim])
1679  {
1680  initialize(res);
1681  }
1682 
1687  SparseGridGpu(const size_t (& res)[dim], unsigned int stencilSupportRadius = 1)
1688  :stencilSupportRadius(stencilSupportRadius)
1689  {
1690  initialize(res);
1691  };
1692 
1697  SparseGridGpu(linearizer & gridGeometry, unsigned int stencilSupportRadius = 1)
1698  : gridGeometry(gridGeometry),
1699  stencilSupportRadius(stencilSupportRadius)
1700  {
1701  // convert to size_t
1702  size_t sz_st[dim];
1703 
1704  for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
1705 
1706  initialize(sz_st);
1707  };
1708 
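// Minimal usage sketch (illustrative only, not part of the original source). It assumes the
// OpenFPM headers above, a GPU context `ctx` of type mgpu::ofp_context_t, a user-written
// insertion kernel `my_insert_kernel` working against the toKernel() interface, and the
// reduction functor smax_ from the flush machinery:
//
//   size_t sz[3] = {512,512,512};
//   SparseGridGpu<3, aggregate<float>> grid(sz);      // defaults: 8x8x8 chunks, radius-1 ghost
//   grid.setGPUInsertBuffer(nBlocks, nSlotsPerBlock); // reserve insert slots
//   // CUDA_LAUNCH(my_insert_kernel, ite, grid.toKernel(), ...);
//   // grid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);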
1709  SparseGridGpu_ker
1710 <
1711  dim,
1712  blockEdgeSize,
1713  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1714  ct_par<0,1>,
1715  indexT,
1716  layout_base,
1717  decltype(extendedBlockGeometry),
1718  linearizer,
1719  AggregateT
1720  > toKernel()
1721  {
1722  SparseGridGpu_ker
1723 <
1724  dim,
1725  blockEdgeSize,
1726  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1727  ct_par<0,1>,
1728  indexT,
1729  layout_base,
1730  decltype(extendedBlockGeometry),
1731  linearizer,
1732  AggregateT
1733  > toKer(
1735  gridGeometry,
1736  extendedBlockGeometry,
1737  stencilSupportRadius,
1738  ghostLayerToThreadsMapping.toKernel(),
1739  nn_blocks.toKernel(),
1740  e_points.toKernel(),
1741  ghostLayerSize,
1742  bck);
1743  return toKer;
1744  }
1745 
1746  template<unsigned int nNN, unsigned int nLoop>
1747  SparseGridGpu_ker
1748 <
1749  dim,
1750  blockEdgeSize,
1751  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1752  ct_par<nNN,nLoop>,
1753  indexT,
1754  layout_base,
1755  decltype(extendedBlockGeometry),
1756  linearizer,
1757  AggregateT
1758  > toKernelNN()
1759  {
1760  SparseGridGpu_ker
1761 <
1762  dim,
1763  blockEdgeSize,
1764  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1765  ct_par<nNN,nLoop>,
1766  indexT,
1767  layout_base,
1768  decltype(extendedBlockGeometry),
1769  linearizer,
1770  AggregateT
1771  > toKer(
1773  gridGeometry,
1774  extendedBlockGeometry,
1775  stencilSupportRadius,
1776  ghostLayerToThreadsMapping.toKernel(),
1777  nn_blocks.toKernel(),
1778  e_points.toKernel(),
1779  ghostLayerSize,
1780  bck);
1781  return toKer;
1782  }
1783 
1787  void clear()
1788  {
1789  BMG::clear();
1790  }
1791 
1792  /* \brief Does nothing
1793  *
1794  */
1795  void setMemory()
1796  {}
1797 
1803  linearizer & getGrid()
1804  {
1805  return gridGeometry;
1806  }
1807 
1813  template<typename stencil_type>
1814  void setNNType()
1815  {
1816  computeGhostLayerMapping<stencil_type>();
1817  }
1818 
1819 
1820  constexpr static unsigned int getBlockEdgeSize()
1821  {
1822  return blockEdgeSize;
1823  }
1824 
1825  constexpr unsigned int getBlockSize() const
1826  {
1827  return blockSize;
1828  }
1829 
1830  // Geometry
1831  template<typename CoordT>
1832  inline size_t getLinId(CoordT &coord)
1833  {
1834  return gridGeometry.LinId(coord);
1835  }
1836 
1837  inline grid_key_dx<dim, int> getCoord(size_t linId) const
1838  {
1839  return gridGeometry.InvLinId(linId);
1840  }
1841 
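// Illustrative note (not part of the original source): getLinId and getCoord above are the two
// directions of the same chunk-wise linearization provided by the linearizer (grid_smb by
// default), so for any in-range key the round trip is the identity:
//
//   // grid_key_dx<dim,int> k2 = grid.getCoord(grid.getLinId(k));   // k2 == k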
1842  inline ite_gpu<dim> getGridGPUIterator(const grid_key_dx<dim, int> & start, const grid_key_dx<dim, int> & stop, size_t n_thr = threadBlockSize)
1843  {
1844  return gridSize.getGPUIterator(start,stop,n_thr);
1845  }
1846 
1856  template<typename CoordT>
1858  {
1859  base_key k(*this,0,0);
1860 
1861  const auto & blockMap = this->private_get_blockMap();
1862 
1863  auto glid = gridGeometry.LinId(coord);
1864 
1865  auto bid = glid / blockSize;
1866  auto lid = glid % blockSize;
1867 
1868  auto key = blockMap.get_sparse(bid);
1869 
1870  k.set_cnk_pos_id(key.id);
1871  k.set_data_id(lid);
1872 
1873  return k;
1874  }
1875 
1885  template<unsigned int p, typename CoordT>
1886  auto get(const grid_key_dx<dim,CoordT> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1887  {
1888  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template get<p>(gridGeometry.LinId(coord));
1889  }
1890 
1900  template<unsigned int p>
1901  auto get(const sparse_grid_gpu_index<self> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1902  {
1903  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1904  }
1905 
1911  auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer())
1912  {
1913  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1914  }
1915 
1921  auto private_get_data_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer())
1922  {
1923  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1924  }
1925 
1933  template<typename CoordT>
1934  auto get_o(const grid_key_dx<dim,CoordT> & coord) const -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(0))>::type >
1935  {
1936  int offset;
1937  indexT lin;
1938  gridGeometry.LinId(coord,lin,offset);
1939 
1941  }
1942 
1950  auto get_o(const sparse_grid_gpu_index<self> & coord) const -> encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >
1951  {
1952  return encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >(coord.get_data_id(),private_get_data_array().get(coord.get_cnk_pos_id()));
1953  }
1954 
1960  bool isSkipLabellingPossible()
1961  {
1962  return (index_size_swp_r == private_get_index_array().size()) && (index_size_swp == private_get_index_array().size());
1963  }
1964 
1974  template<unsigned int p>
1975  auto get(const sparse_grid_gpu_index<self> & coord) -> ScalarTypeOf<AggregateBlockT, p> &
1976  {
1977  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1978  }
1979 
1987  unsigned char getFlag(const sparse_grid_gpu_index<self> & coord) const
1988  {
1989  return private_get_data_array().template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(coord.get_cnk_pos_id())[coord.get_data_id()];
1990  }
1991 
1992  template<unsigned int p, typename CoordT>
1993  auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
1994  {
1995  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template insert<p>(gridGeometry.LinId(coord));
1996  }
1997 
1998  template<typename CoordT>
1999  auto insert_o(const CoordT &coord) -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insert_o(0))>::type >
2000  {
2001  indexT ind;
2002  int offset;
2003  gridGeometry.LinId(coord,ind,offset);
2004 
2006  }
2007 
2014  void construct_link(self & grid_up, self & grid_dw, mgpu::ofp_context_t &context)
2015  {
2016 /* // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2017  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2018  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2019 
2020  ite_gpu<1> ite;
2021 
2022  ite.wthr.x = indexBuffer.size();
2023  ite.wthr.y = 1;
2024  ite.wthr.z = 1;
2025 
2026  ite.thr.x = getBlockSize();
2027  ite.thr.y = 1;
2028  ite.thr.z = 1;
2029 
2030  openfpm::vector_gpu<aggregate<unsigned int>> output;
2031  output.resize(indexBuffer.size() + 1);
2032 
2033  CUDA_LAUNCH((SparseGridGpuKernels::link_construct<dim,
2034  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2035  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel());
2036 
2037 
2038  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2039 
2040  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2041 
2042  unsigned int np_lup = output.template get<0>(output.size()-1);
2043 
2044  links_up.resize(np_lup);
2045 
2046  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert<dim,
2047  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2048  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel(),links_up.toKernel());
2049 
2050 */
2051  }
2052 
2059  {
2060  return link_dw_scan;
2061  }
2062 
2069  {
2070  return link_dw;
2071  }
2072 
2079  {
2080  return link_up_scan;
2081  }
2082 
2089  {
2090  return link_up;
2091  }
2092 
2101  void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, mgpu::ofp_context_t &context)
2102  {
2103  Box<dim,int> db = db_;
2104 
2105  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2106  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2107  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2108 
2109  // Count padding points
2110 
2111  // First we count the padding points
2112  ite_gpu<1> ite;
2113 
2114  ite.wthr.x = indexBuffer.size();
2115  ite.wthr.y = 1;
2116  ite.wthr.z = 1;
2117 
2118  ite.thr.x = getBlockSize();
2119  ite.thr.y = 1;
2120  ite.thr.z = 1;
2121 
2122  openfpm::vector_gpu<aggregate<unsigned int>> output;
2123  output.resize(indexBuffer.size()+1);
2124 
2125  output.fill<0>(0);
2126 
2127  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2128  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2129  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2130 
2131 
2132 
2133  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2134 
2135  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2136  unsigned int padding_points = output.template get<0>(output.size()-1);
2137 
2138  // get the padding points
2139 
2141  pd_points.resize(padding_points);
2142 
2143  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2144 
2145  // Count number of link down for padding points
2146 
2147  // Calculate ghost
2148 
2149  link_dw_scan.resize(padding_points+1);
2150  link_dw_scan.fill<0>(0);
2151 
2152  ite = link_dw_scan.getGPUIterator();
2153 
2154  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
2155  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2156  blockSize>),
2157  ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);
2158 
2159  openfpm::scan((unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),link_dw_scan.size(),(unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),context);
2160 
2161  link_dw_scan.template deviceToHost<0>(link_dw_scan.size()-1,link_dw_scan.size()-1);
2162 
2163  size_t np_ldw = link_dw_scan.template get<0>(link_dw_scan.size()-1);
2164 
2165  link_dw.resize(np_ldw);
2166 
2167  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
2168  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2169  blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);
2170 
2171  link_dw_scan.resize(link_dw_scan.size()-1);
2172  }
2173 
2179  void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, mgpu::ofp_context_t &context)
2180  {
2181  Box<dim,int> db = db_;
2182 
2183  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2184  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2185  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2186 
2187  // Count padding points
2188 
2189  // First we count the padding points
2190  ite_gpu<1> ite;
2191 
2192  ite.wthr.x = indexBuffer.size();
2193  ite.wthr.y = 1;
2194  ite.wthr.z = 1;
2195 
2196  ite.thr.x = getBlockSize();
2197  ite.thr.y = 1;
2198  ite.thr.z = 1;
2199 
2200  openfpm::vector_gpu<aggregate<unsigned int>> output;
2201  output.resize(indexBuffer.size()+1);
2202 
2203  output.fill<0>(0);
2204 
2205  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2206  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2207  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2208 
2209 
2210 
2211  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2212 
2213  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2214  unsigned int padding_points = output.template get<0>(output.size()-1);
2215 
2216  // get the padding points
2217 
2219  pd_points.resize(padding_points);
2220 
2221  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2222 
2223  // Count number of link down for padding points
2224 
2225  // Calculate ghost
2226 
2227  link_up_scan.resize(padding_points+1);
2228  link_up_scan.fill<0>(0);
2229 
2230  ite = link_up_scan.getGPUIterator();
2231 
2232  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
2233  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2234  blockSize>),
2235  ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);
2236 
2237  openfpm::scan((unsigned int *)link_up_scan.template getDeviceBuffer<0>(),link_up_scan.size(),(unsigned int *)link_up_scan.template getDeviceBuffer<0>(),context);
2238 
2239  link_up_scan.template deviceToHost<0>(link_up_scan.size()-1,link_up_scan.size()-1);
2240 
2241  size_t np_lup = link_up_scan.template get<0>(link_up_scan.size()-1);
2242 
2243  link_up.resize(np_lup);
2244 
2245  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
2246  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2247  blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
2248 
2249  link_up_scan.resize(link_up_scan.size()-1);
2250  }
2251 
2258  template<typename dim3T>
2259  void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
2260  {
2262  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::setGPUInsertBuffer(
2263  dim3SizeToInt(nBlock),
2264  dim3SizeToInt(nSlot)
2265  );
2266  }
2267 
2273  void preFlush()
2274  {
2276  }
2277 
2278  template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
2279  void tagBoundaries(mgpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
2280  {
2281  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2284 
2285  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
2286  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
2287 
2288  if (numScalars == 0) return;
2289  if (findNN == false)
2290  {
2291  findNeighbours<stencil_type>();
2292  findNN = true;
2293  }
2294 
2295  // NOTE: Here we want to work only on one data chunk per block!
2296 
2297  unsigned int localThreadBlockSize = dataChunkSize;
2298  unsigned int threadGridSize = numScalars % dataChunkSize == 0
2299  ? numScalars / dataChunkSize
2300  : 1 + numScalars / dataChunkSize;
2301 
2302  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * 1)>::value; // todo: This works only for stencilSupportSize==1
2303 // constexpr unsigned int nLoop = IntPow<blockEdgeSize + 2, dim>::value; // todo: This works only for stencilSupportSize==1
2304 
2305  if (stencilSupportRadius == 1)
2306  {
2307  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2308  dim,
2309  1,
2311  stencil_type,
2312  checker_type>),
2313  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2314  }
2315  else if (stencilSupportRadius == 2)
2316  {
2317  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2318  dim,
2319  2,
2321  stencil_type,
2322  checker_type>),
2323  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2324  }
2325  else if (stencilSupportRadius == 0)
2326  {
2327  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2328  dim,
2329  0,
2331  stencil_type,
2332  checker_type>),
2333  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2334  }
2335  else
2336  {
2337  // todo: report this through a proper error/exception mechanism instead of only printing
2338  std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;
2339 
2340  }
2341 
2342  if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
2343  {
2344  // first we calculate the existing points
2346 
2347  block_points.resize(indexBuffer.size() + 1);
2348  block_points.template get<0>(block_points.size()-1) = 0;
2349  block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);
2350 
2351  ite_gpu<1> ite;
2352 
2353  ite.wthr.x = indexBuffer.size();
2354  ite.wthr.y = 1;
2355  ite.wthr.z = 1;
2356  ite.thr.x = getBlockSize();
2357  ite.thr.y = 1;
2358  ite.thr.z = 1;
2359 
2360  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2361  ite,
2362  dataBuffer.toKernel(),
2363  block_points.toKernel());
2364 
2365  // then we scan
2366  openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),context);
2367 
2368  // Get the total number of points
2369  block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
2370  size_t tot = block_points.template get<0>(block_points.size()-1);
2371  e_points.resize(tot);
2372 
2373  // we fill e_points
2374  CUDA_LAUNCH((SparseGridGpuKernels::fill_e_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,
2375  dataBuffer.toKernel(),
2376  block_points.toKernel(),
2377  e_points.toKernel())
2378 
2379  }
2380 
2381  cudaDeviceSynchronize();
2382  }
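 // Illustrative usage of tagBoundaries() (editorial sketch, not part of the original source);
 // "sparse_grid" and "ctx" are placeholder names for an already filled grid and an mgpu context:
 //
 //   sparse_grid.template tagBoundaries<NNStar<dim>>(ctx);
 //   // or, to also build the compact list of existing points (e_points):
 //   sparse_grid.template tagBoundaries<NNStar<dim>>(ctx, No_check(),
 //                        tag_boundaries::CALCULATE_EXISTING_POINTS);
 //
 // findNeighbours() is invoked lazily by tagBoundaries() when the neighbour blocks have not been
 // computed yet (findNN == false).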
2383 
2384  template<typename NNtype = NNStar<dim>>
2385  void findNeighbours()
2386  {
2387  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2389 
2390  const unsigned int numBlocks = indexBuffer.size();
2391  const unsigned int numScalars = numBlocks * NNtype::nNN;
2392  nn_blocks.resize(numScalars);
2393 
2394  if (numScalars == 0) return;
2395 
2396  // NOTE: Here we want to work only on one data chunk per block!
2397 
2398  unsigned int localThreadBlockSize = NNtype::nNN;
2399 
2400  unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
2401  ? numScalars / localThreadBlockSize
2402  : 1 + numScalars / localThreadBlockSize;
2403 
2404  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
2405  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
2406 
2407  findNN = true;
2408  }
2409 
2410  size_t countExistingElements() const
2411  {
2412  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2415 
2417  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2418  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2419  constexpr unsigned int blockSize = MaskBlockT::size;
2420  const auto bufferSize = indexBuffer.size();
2421 
2422  size_t numExistingElements = 0;
2423 
2424  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2425  {
2426  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2427  for (size_t elementId=0; elementId<blockSize; ++elementId)
2428  {
2429  const auto curMask = dataBlock.template get<pMask>()[elementId];
2430 
2431  if (this->exist(curMask))
2432  {
2433  ++numExistingElements;
2434  }
2435  }
2436  }
2437 
2438  return numExistingElements;
2439  }
2440 
2441  size_t countBoundaryElements()
2442  {
2443  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2446 
2448  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2449  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2450  constexpr unsigned int blockSize = MaskBlockT::size;
2451  const auto bufferSize = indexBuffer.size();
2452 
2453  size_t numBoundaryElements = 0;
2454 
2455  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2456  {
2457  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2458  for (size_t elementId=0; elementId<blockSize; ++elementId)
2459  {
2460  const auto curMask = dataBlock.template get<pMask>()[elementId];
2461 
2462  if (this->exist(curMask) && this->isPadding(curMask))
2463  {
2464  ++numBoundaryElements;
2465  }
2466  }
2467  }
2468 
2469  return numBoundaryElements;
2470  }
2471 
2472  // Compute mean and standard deviation of the occupancy of the existing blocks (padding points included)
2473  void measureBlockOccupancyMemory(double &mean, double &deviation)
2474  {
2475  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2478 
2480  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2481  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2482  constexpr unsigned int blockSize = MaskBlockT::size;
2483  const auto bufferSize = indexBuffer.size();
2484 
2485  openfpm::vector<double> measures;
2486 
2487  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2488  {
2489  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2490  size_t numElementsInBlock = 0;
2491  for (size_t elementId=0; elementId<blockSize; ++elementId)
2492  {
2493  const auto curMask = dataBlock.template get<pMask>()[elementId];
2494 
2495  if (this->exist(curMask))
2496  {
2497  ++numElementsInBlock;
2498  }
2499  }
2500  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2501  measures.add(blockOccupancy);
2502  }
2503 
2504  standard_deviation(measures, mean, deviation);
2505  }
2506 
2507  // Compute mean and standard deviation of the occupancy of the existing blocks (padding points excluded)
2508  void measureBlockOccupancy(double &mean, double &deviation)
2509  {
2510  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2513 
2515  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2516  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2517  constexpr unsigned int blockSize = MaskBlockT::size;
2518  const auto bufferSize = indexBuffer.size();
2519 
2520  openfpm::vector<double> measures;
2521 
2522  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2523  {
2524  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2525  size_t numElementsInBlock = 0;
2526  for (size_t elementId=0; elementId<blockSize; ++elementId)
2527  {
2528  const auto curMask = dataBlock.template get<pMask>()[elementId];
2529 
2530  if (this->exist(curMask) && !this->isPadding(curMask))
2531  {
2532  ++numElementsInBlock;
2533  }
2534  }
2535  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2536  measures.add(blockOccupancy);
2537  }
2538 
2539  standard_deviation(measures, mean, deviation);
2540  }
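 // Illustrative usage (editorial sketch, not part of the original source): both measures run on the
 // host and fill the mean and the standard deviation of the fraction of occupied slots per chunk;
 // measureBlockOccupancyMemory() counts padding points as well, measureBlockOccupancy() does not:
 //
 //   double mean = 0.0, deviation = 0.0;
 //   sparse_grid.measureBlockOccupancy(mean, deviation);
 //   sparse_grid.measureBlockOccupancyMemory(mean, deviation);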
2541 
2556  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2557  void conv_cross(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2558  {
2559  Box<dim,int> box;
2560 
2561  for (int i = 0 ; i < dim ; i++)
2562  {
2563  box.setLow(i,start.get(i));
2564  box.setHigh(i,stop.get(i));
2565  }
2566 
2567  applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2568  }
2569 
2570 
2575  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2576  void conv(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2577  {
2578  Box<dim,int> box;
2579 
2580  for (int i = 0 ; i < dim ; i++)
2581  {
2582  box.setLow(i,start.get(i));
2583  box.setHigh(i,stop.get(i));
2584  }
2585 
2586  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2587 
2588  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2589  }
2590 
2595  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2596  void conv_cross_b(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2597  {
2598  Box<dim,int> box;
2599 
2600  for (int i = 0 ; i < dim ; i++)
2601  {
2602  box.setLow(i,start.get(i));
2603  box.setHigh(i,stop.get(i));
2604  }
2605 
2606  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2607 
2608  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2609  }
2610 
2615  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2616  void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2617  {
2618  Box<dim,int> box;
2619 
2620  for (int i = 0 ; i < dim ; i++)
2621  {
2622  box.setLow(i,start.get(i));
2623  box.setHigh(i,stop.get(i));
2624  }
2625 
2626  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2627 
2628  applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2629  }
2630 
2635  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2636  void conv2(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2637  {
2638  Box<dim,int> box;
2639 
2640  for (int i = 0 ; i < dim ; i++)
2641  {
2642  box.setLow(i,start.get(i));
2643  box.setHigh(i,stop.get(i));
2644  }
2645 
2646  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2647 
2648  applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2649  }
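 // Editorial sketch (not part of the original source): the conv* family above only builds the box
 // [start,stop] and forwards to applyStencils() with STENCIL_MODE_INPLACE. A call has the shape
 // below; the exact signature of the __device__ lambda depends on the kernel functor used
 // (stencil_cross_func, stencil_cross_func_conv, stencil_func_conv2, ...), so the lambda here is
 // only a placeholder and "sz" is an assumed edge size:
 //
 //   grid_key_dx<3> start(1,1,1);
 //   grid_key_dx<3> stop(sz-2,sz-2,sz-2);
 //   sparse_grid.template conv<0,1,1>(start, stop,
 //        [] __device__ (auto & ... args){ /* user update */ });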
2650 
2656  Box<dim,int> getBox()
2657  {
2658  Box<dim,int> b;
2659 
2660  for (int i = 0 ; i < dim ; i++)
2661  {
2662  b.setLow(i,0);
2663  b.setHigh(i,gridGeometry.getSize()[i]);
2664  }
2665 
2666  return b;
2667  }
2668 
2669  // todo: Move the implementations into a functor for a compile-time choice of the stencil mode
2670  template<typename stencil, typename... Args>
2671  void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
2672  {
2673  if (findNN == false)
2674  {
2675  findNeighbours<typename stencil::stencil_type>();
2676  findNN = true;
2677  }
2678 
2679  // Apply the given stencil to all the elements which are not boundary-tagged.
2680  // This function launches a __global__ kernel (provided by us) on all the existing blocks;
2681  // that kernel checks whether each element exists and is not padding, and on those elements it
2682  // calls the user-provided __device__ functor. The stencil mode selects how the block passed to
2683  // the user functor as storeBlock is loaded: with Insert the block is obtained through an insert
2684  // (and the necessary auxiliary functions are called); with In-place the block is taken directly from the data buffer.
2685  switch (mode)
2686  {
2687  case STENCIL_MODE_INPLACE:
2688  applyStencilInPlace<stencil>(box,mode,args...);
2689  break;
2690  case STENCIL_MODE_INPLACE_NO_SHARED:
2691  applyStencilInPlaceNoShared<stencil>(box,mode,args...);
2692  break;
2693  }
2694  }
2695  template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
2696  void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
2697  {
2698  applyStencils<stencil1>(box,mode, args...);
2699  applyStencils<stencil2, otherStencils ...>(box,mode, args...);
2700  }
2701 
2702  template<typename BitMaskT>
2703  inline static bool isPadding(BitMaskT &bitMask)
2704  {
2706  ::getBit(bitMask, PADDING_BIT);
2707  }
2708 
2709  template<typename BitMaskT>
2710  inline static void setPadding(BitMaskT &bitMask)
2711  {
2713  ::setBit(bitMask, PADDING_BIT);
2714  }
2715 
2716  template<typename BitMaskT>
2717  inline static void unsetPadding(BitMaskT &bitMask)
2718  {
2720  ::unsetBit(bitMask, PADDING_BIT);
2721  }
2722 
2730  template<typename CoordT>
2731  inline size_t getBlockLinId(const CoordT & blockCoord) const
2732  {
2733  return gridGeometry.BlockLinId(blockCoord);
2734  }
2735 
2746  template<unsigned int p>
2747  auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2748  {
2750 
2751  indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
2752  indexT local_id = coord.get_data_id();
2753 
2755 
2756  auto block_data = this->insertBlockFlush(block_id);
2757  block_data.template get<BMG::pMask>()[local_id] = 1;
2758 
2759  return block_data.template get<p>()[local_id];
2760  }
2761 
2772  template<unsigned int p, typename CoordT>
2773  auto insertFlush(const grid_key_dx<dim,CoordT> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2774  {
2775  // Linearized block_id
2776  auto lin = gridGeometry.LinId(coord);
2777  indexT block_id = lin / blockSize;
2778  indexT local_id = lin % blockSize;
2779 
2781 
2782  auto block_data = this->insertBlockFlush(block_id);
2783  block_data.template get<BMG::pMask>()[local_id] = 1;
2784 
2785  return block_data.template get<p>()[local_id];
2786  }
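 // Illustrative host-side usage (editorial sketch, not part of the original source): insertFlush()
 // decomposes the linearized key into a chunk id (lin / blockSize) and a local offset
 // (lin % blockSize), allocates the chunk if needed and marks the point as existing:
 //
 //   grid_key_dx<dim> key;                             // fill with the coordinates of the point to insert
 //   sparse_grid.template insertFlush<0>(key) = 1.0;   // value type given by property 0 of AggregateT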
2787 
2788  template<unsigned int p>
2789  void print_vct_add_data()
2790  {
2791  typedef BlockMapGpu<
2793  threadBlockSize, indexT, layout_base> BMG;
2794 
2795  auto & bM = BMG::blockMap.private_get_vct_add_data();
2796  auto & vI = BMG::blockMap.private_get_vct_add_index();
2797  bM.template deviceToHost<p>();
2798  vI.template deviceToHost<0>();
2799 
2800  std::cout << "vct_add_data: " << std::endl;
2801 
2802  for (size_t i = 0 ; i < bM.size() ; i++)
2803  {
2804  std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
2805  for (size_t j = 0 ; j < blockSize ; j++)
2806  {
2807  std::cout << (int)bM.template get<p>(i)[j] << " ";
2808  }
2809 
2810  std::cout << std::endl;
2811  }
2812  }
2813 
2819  template<unsigned int p>
2820  void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
2821  {
2822  meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
2823 
2824  BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
2825  }
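 // Illustrative usage (editorial sketch, not part of the original source), assuming property 0 of
 // AggregateT is a float:
 //
 //   sparse_grid.template setBackgroundValue<0>(0.0f);  // value returned for non-existing points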
2826 
2828 
2829  //Functions to check if the packing object is complex
2830  static bool pack()
2831  {
2832  return true;
2833  }
2834 
2835  //Functions to check if the packing object is complex
2836  static bool packRequest()
2837  {
2838  return true;
2839  }
2840 
2845  template<int ... prp> inline
2846  void packRequest(size_t & req) const
2847  {
2848  // To fill
2851 
2852  indexBuffer.template packRequest<prp ...>(req);
2853  dataBuffer.template packRequest<prp ...>(req);
2854 
2855  Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
2856  }
2857 
2867  template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
2868  Pack_stat & sts) const
2869  {
2872 
2873  // To fill
2874  indexBuffer.template pack<prp ...>(mem,sts);
2875  dataBuffer.template pack<prp ...>(mem,sts);
2876 
2877  Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
2878  }
2879 
2889  template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
2890  Unpack_stat & ps)
2891  {
2894 
2895  // To fill
2896  indexBuffer.template unpack<prp ...>(mem,ps);
2897  dataBuffer.template unpack<prp ...>(mem,ps);
2898 
2899  Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
2900  }
2901 
2902 
2912  template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
2913  Unpack_stat & ps)
2914  {
2915  if (mem.size() != 0)
2916  {std::cout << __FILE__ << ":" << __LINE__ << " error: unpack from ExtPreAlloc<CudaMemory> is not implemented" << std::endl;}
2917  }
2918 
2924  template<int ... prp> inline
2925  void packRequest(size_t & req, mgpu::ofp_context_t &context) const
2926  {
2927  ite_gpu<1> ite;
2928 
2929  auto & indexBuffer = private_get_index_array();
2930  auto & dataBuffer = private_get_data_array();
2931 
2932  ite.wthr.x = indexBuffer.size();
2933  ite.wthr.y = 1;
2934  ite.wthr.z = 1;
2935  ite.thr.x = getBlockSize();
2936  ite.thr.y = 1;
2937  ite.thr.z = 1;
2938 
2939  tmp.resize(indexBuffer.size() + 1);
2940 
2941  // Launch a kernel that counts the number of elements in each chunk
2942  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2943  ite,
2944  dataBuffer.toKernel(),
2945  tmp.toKernel());
2946 
2947  openfpm::scan((indexT *)tmp. template getDeviceBuffer<0>(),
2948  tmp.size(), (indexT *)tmp. template getDeviceBuffer<0>(), context);
2949 
2950  tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);
2951 
2952  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
2953 
2954  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);
2955 
2956  size_t n_pnt = tmp.template get<0>(tmp.size()-1);
2957 
2958 
2959  // For each point we pack its data, a short int with the offset of the point
2960  // inside its chunk, and one mask byte
2961  req = sizeof(indexT) + // bytes required to pack the number of chunks
2962  sizeof(indexT)*indexBuffer.size() + // bytes required to pack the chunk indexes
2963  sizeof(indexT)*tmp.size() + // bytes required to pack the scan of the points per chunk
2964  n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char)); // bytes required to pack data + offset + mask
2965  }
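 // Worked example of the request size computed above (illustrative numbers, not from the source):
 // with indexT = long int (8 bytes), 100 chunks (so tmp.size() == 101), n_pnt == 1000 existing
 // points and spq.point_size == 12 bytes:
 //
 //   req = 8                    // number of chunks
 //       + 8*100                // chunk indexes
 //       + 8*101                // scan of the points per chunk
 //       + 1000*(12 + 2 + 1)    // data + short offset + mask byte for every point
 //       = 16616 bytes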
2966 
2981  template<int ... prp> inline
2982  void packRequest(SparseGridGpu_iterator_sub<dim,self> & sub_it,
2983  size_t & req) const
2984  {
2985  pack_subs.add();
2986 
2987  for (int i = 0 ; i < dim ; i++)
2988  {
2989  pack_subs.template get<0>(pack_subs.size()-1)[i] = sub_it.getStart().get(i);
2990  pack_subs.template get<1>(pack_subs.size()-1)[i] = sub_it.getStop().get(i);
2991  }
2992  }
2993 
2998  void packReset()
2999  {
3000  pack_subs.clear();
3001 
3002  index_ptrs.clear();
3003  scan_ptrs.clear();
3004  data_ptrs.clear();
3005  offset_ptrs.clear();
3006  mask_ptrs.clear();
3007 
3008  req_index = 0;
3009  }
3010 
3017  template<int ... prp> inline
3018  void packCalculate(size_t & req, mgpu::ofp_context_t &context)
3019  {
3020  ite_gpu<1> ite;
3021  pack_subs.template hostToDevice<0,1>();
3022 
3023  auto & indexBuffer = private_get_index_array();
3024  auto & dataBuffer = private_get_data_array();
3025 
3026  ite.wthr.x = indexBuffer.size();
3027  ite.wthr.y = 1;
3028  ite.wthr.z = 1;
3029  ite.thr.x = getBlockSize();
3030  ite.thr.y = 1;
3031  ite.thr.z = 1;
3032 
3033  tmp.resize((indexBuffer.size() + 1)*pack_subs.size());
3034 
3035  if (indexBuffer.size() != 0)
3036  {
3037  if (pack_subs.size() <= 32)
3038  {
3039  // Launch a kernel that counts the number of elements in each chunk
3040  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3042  32,
3043  indexT>),
3044  ite,
3045  indexBuffer.toKernel(),
3046  pack_subs.toKernel(),
3047  gridGeometry,
3048  dataBuffer.toKernel(),
3049  tmp.toKernel(),
3050  indexBuffer.size() + 1);
3051  }
3052  else if (pack_subs.size() <= 64)
3053  {
3054  // Launch a kernel that counts the number of elements in each chunk
3055  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3057  64,
3058  indexT>),
3059  ite,
3060  indexBuffer.toKernel(),
3061  pack_subs.toKernel(),
3062  gridGeometry,
3063  dataBuffer.toKernel(),
3064  tmp.toKernel(),
3065  indexBuffer.size() + 1);
3066  }
3067  else if (pack_subs.size() <= 96)
3068  {
3069  // Launch a kernel that counts the number of elements in each chunk
3070  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3072  96,
3073  indexT>),
3074  ite,
3075  indexBuffer.toKernel(),
3076  pack_subs.toKernel(),
3077  gridGeometry,
3078  dataBuffer.toKernel(),
3079  tmp.toKernel(),
3080  indexBuffer.size() + 1);
3081  }
3082  else if (pack_subs.size() <= 128)
3083  {
3084  // Launch a kernel that counts the number of elements in each chunk
3085  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3087  128,
3088  indexT>),
3089  ite,
3090  indexBuffer.toKernel(),
3091  pack_subs.toKernel(),
3092  gridGeometry,
3093  dataBuffer.toKernel(),
3094  tmp.toKernel(),
3095  indexBuffer.size() + 1);
3096  }
3097  else
3098  {
3099  std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
3100  }
3101  }
3102 
3103  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3104 
3105  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3106 
3107  scan_it.resize(pack_subs.size());
3108 
3109  // scan all
3110  for (size_t i = 0 ; i < pack_subs.size() ; i++)
3111  {
3112  size_t n_pnt = 0;
3113  size_t n_cnk = 0;
3114 
3115  tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3116  tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3117 
3118  // put a zero at the end
3119  tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3120  tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3121 
3122  openfpm::scan(((indexT *)tmp. template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
3123  indexBuffer.size() + 1, (indexT *)tmp. template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), context);
3124 
3125  openfpm::scan(((unsigned int *)tmp. template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
3126  indexBuffer.size() + 1, (unsigned int *)tmp. template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), context);
3127 
3128  tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3129  tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3130 
3131  scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3132 
3133  n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3134  n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3135 
3136  req += sizeof(size_t) + // bytes required to pack the number of chunks packed
3137  2*dim*sizeof(int) + // starting point + grid size used for the indexing
3138  sizeof(indexT)*n_cnk + // bytes required to pack the chunk indexes
3139  align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) + // bytes required to pack the scan of the chunk points
3140  align_number(sizeof(indexT),n_pnt*(spq.point_size)) + // bytes required to pack the data
3141  align_number(sizeof(indexT),n_pnt*sizeof(short int)) + // bytes required to pack the offsets
3142  align_number(sizeof(indexT),n_pnt*sizeof(unsigned char)); // bytes required to pack the masks
3143  }
3144 
3145  scan_it.template hostToDevice<0>();
3146 
3147  openfpm::scan((indexT *)scan_it. template getDeviceBuffer<0>(),
3148  scan_it.size(), (indexT *)scan_it. template getDeviceBuffer<0>(), context);
3149  }
3150 
3156  auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
3157  {
3158  return this->blockMap.getMappingVector();
3159  }
3160 
3166  auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
3167  {
3168  return this->blockMap.getMergeIndexMapVector();
3169  }
3170 
3185  template<int ... prp> void pack(ExtPreAlloc<CudaMemory> & mem,
3186  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3187  Pack_stat & sts)
3188  {
3189  unsigned int i = req_index;
3190 
3191  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3192  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3193 
3194  auto & indexBuffer = private_get_index_array();
3195  auto & dataBuffer = private_get_data_array();
3196 
3197  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3198  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3199 
3200  Packer<size_t,CudaMemory>::pack(mem,n_cnk,sts);
3201  mem.hostToDevice(mem.getOffset(),mem.getOffset()+sizeof(size_t));
3202 
3203  size_t offset1 = mem.getOffsetEnd();
3204 
3205  grid_key_dx<dim> key = sub_it.getStart();
3206 
3207  for (int i = 0 ; i < dim ; i++)
3208  {Packer<int,CudaMemory>::pack(mem,key.get(i),sts);}
3209 
3210  for (int i = 0 ; i < dim ; i++)
3211  {Packer<int,CudaMemory>::pack(mem,(int)gridGeometry.getSize()[i],sts);}
3212 
3213  mem.hostToDevice(offset1,offset1+2*dim*sizeof(int));
3214 
3215  // chunk indexes
3216  mem.allocate(n_cnk*sizeof(indexT));
3217  index_ptrs.add(mem.getDevicePointer());
3218 
3219  // chunk point scan
3220  mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );
3221  scan_ptrs.add(mem.getDevicePointer());
3222 
3223  // chunk data
3224  mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );
3225  data_ptrs.add(mem.getDevicePointer());
3226 
3227  // space for offsets
3228  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );
3229  offset_ptrs.add(mem.getDevicePointer());
3230 
3231  // space for the masks
3232  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
3233  mask_ptrs.add(mem.getDevicePointer());
3234 
3235  req_index++;
3236  }
3237 
3238 
3255  template<unsigned int ... prp>
3256  void removeCopyToFinalize(mgpu::ofp_context_t & ctx, int opt)
3257  {
3258  if ((opt & 0x3) == rem_copy_opt::PHASE1)
3259  {
3260  this->template removeCopyToFinalize_phase1<prp ...>(ctx,opt);
3261  }
3262  else if ((opt & 0x3) == rem_copy_opt::PHASE2)
3263  {
3264  this->template removeCopyToFinalize_phase2<prp ...>(ctx,opt);
3265  }
3266  else
3267  {
3268  this->template removeCopyToFinalize_phase3<prp ...>(ctx,opt,false);
3269  }
3270  }
3271 
3284  template<int ... prp> void packFinalize(ExtPreAlloc<CudaMemory> & mem,
3285  Pack_stat & sts,
3286  int opt = 0,
3287  bool is_pack_remote = false)
3288  {
3289 
3290  RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
3291 
3292  if (pack_subs.size() <= 32)
3293  {
3294  pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
3295  }
3296  else if (pack_subs.size() <= 64)
3297  {
3298  pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
3299  }
3300  else if (pack_subs.size() <= 80)
3301  {
3302  pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
3303  }
3304  else
3305  {
3306  std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation available of packFinalize, create a new case for " << pack_subs.size() << std::endl;
3307  }
3308 
3309  savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
3310  }
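 // Plausible packing sequence (editorial sketch based on the member functions above, not a verbatim
 // example from the source); "sparse_grid", "sub_it", "mem", "sts" and "ctx" are placeholder names:
 //
 //   size_t req = 0;
 //   sparse_grid.packReset();                               // clear pack_subs and the pointer lists
 //   sparse_grid.template packRequest<0,1>(sub_it, req);    // queue the box of the sub-iterator
 //   sparse_grid.template packCalculate<0,1>(req, ctx);     // count points per chunk and build the scans
 //   // ... allocate an ExtPreAlloc<CudaMemory> "mem" of at least req bytes ...
 //   sparse_grid.template pack<0,1>(mem, sub_it, sts);      // reserve the sections and record device pointers
 //   sparse_grid.template packFinalize<0,1>(mem, sts);      // fill the reserved sections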
3311 
3317  void removeAddUnpackReset()
3318  {
3319  rem_sects.clear();
3320 
3321  auto & vad = BMG::blockMap.private_get_vct_add_data();
3322  auto & vai = BMG::blockMap.private_get_vct_add_index();
3323 
3324  vad.clear();
3325  vai.clear();
3326 
3327  // Clear variables
3328  offset_ptrs_cp.clear();
3329  scan_ptrs_cp.clear();
3330  n_cnk_cp.clear();
3331  n_pnt_cp.clear();
3332  data_base_ptr_cp.clear();
3333  box_cp.clear();
3334  n_shifts_cp.clear();
3335  convert_blk.clear();
3336  tmp2.clear();
3337  }
3338 
3344  void swap(self & gr)
3345  {
3346  gridGeometry.swap(gr.gridGeometry);
3347 
3348  BMG::swap(gr);
3349  }
3350 
3358  void removePoints(mgpu::ofp_context_t& context)
3359  {
3360  auto & indexBuffer = private_get_index_array();
3361  auto & dataBuffer = private_get_data_array();
3362 
3363  // first we remove
3364  if (rem_sects.size() != 0)
3365  {
3366  rem_sects.template hostToDevice<0,1>();
3367 
3368  tmp.resize(indexBuffer.size() + 1);
3369 
3370  tmp.template get<1>(tmp.size()-1) = 0;
3371  tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);
3372 
3373  auto ite = indexBuffer.getGPUIterator();
3374 
3375  if (has_work_gpu(ite) == true)
3376  {
3377  // mark all the chunks from which points must be removed
3378  CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
3380  blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
3381  gridGeometry,dataBuffer.toKernel(),
3382  tmp.toKernel());
3383 
3384  // scan
3385  openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),context);
3386 
3387  tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);
3388 
3389  // get the number of chunks involved
3390  size_t nr_cnk = tmp.template get<1>(tmp.size()-1);
3391 
3392  tmp3.resize(nr_cnk);
3393 
3394  // collect the chunks involved in the remove
3395  ite = indexBuffer.getGPUIterator();
3396 
3397  if (has_work_gpu(ite) == false) {return;}
3398 
3399  CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());
3400 
3401  // Launch to remove points
3402 
3403  ite = tmp3.getGPUIterator();
3404 
3405  ite.wthr.x = tmp3.size();
3406  ite.wthr.y = 1;
3407  ite.wthr.z = 1;
3408  ite.thr.x = getBlockSize();
3409  ite.thr.y = 1;
3410  ite.thr.z = 1;
3411 
3412  if (has_work_gpu(ite) == false) {return;}
3413 
3414  CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
3416  ite,indexBuffer.toKernel(),
3417  gridGeometry,
3418  dataBuffer.toKernel(),
3419  tmp3.toKernel(),
3420  rem_sects.toKernel());
3421 
3422  tmp3.clear();
3423  }
3424  }
3425  }
3426 
3432  template<unsigned int ... prp>
3433  void removeAddUnpackFinalize(mgpu::ofp_context_t& context, int opt)
3434  {
3435  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3436  {removePoints(context);}
3437 
3438  removeCopyToFinalize_phase3<prp ...>(context,opt,true);
3439  }
3440 
3445  void copyRemoveReset()
3446  {
3447  rem_sects.clear();
3448  copySect.clear();
3449  offset_ptrs_cp.clear();
3450  scan_ptrs_cp.clear();
3451  data_base_ptr_cp.clear();
3452  n_cnk_cp.clear();
3453  n_pnt_cp.clear();
3454  n_shifts_cp.clear();
3455  convert_blk.clear();
3456  box_cp.clear();
3457  data_base_ptr_cp.clear();
3458 
3459  tmp2.clear();
3460  }
3461 
3469  void remove(const Box<dim,int> & section_to_delete)
3470  {
3471  rem_sects.add(section_to_delete);
3472  }
3473 
3479  static constexpr bool isCompressed()
3480  {
3481  return true;
3482  }
3483 
3491  void copy_to(self & grid_src,
3492  const Box<dim,size_t> & box_src,
3493  const Box<dim,size_t> & box_dst)
3494  {
3495  // first we launch a kernel to count the number of points we have
3496 
3497  sparse_grid_section<self> sgs(*this,box_src,box_dst);
3498 
3499  grid_src.copySect.add(sgs);
3500  }
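 // Editorial sketch (not part of the original source): remove() and copy_to() only queue work in
 // rem_sects / copySect; the queued operations are executed later by removeCopyToFinalize() (or by
 // removeAddUnpackFinalize() during unpacking). Placeholder usage:
 //
 //   dst_grid.remove(box_to_clear);                          // queue a region for removal
 //   dst_grid.copy_to(src_grid, box_src, box_dst);           // queue a section copy from src_grid
 //   dst_grid.template removeCopyToFinalize<0,1>(ctx, opt);  // opt selects PHASE1 / PHASE2 / the final phase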
3501 
3505  template<typename pointers_type,
3506  typename headers_type,
3507  typename result_type,
3508  unsigned int ... prp >
3509  static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
3510  {
3511  // we have to increment ps by the right amount
3512  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3513  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3514 
3515  result.allocate(sizeof(int));
3516 
3517  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
3518  pointers.toKernel(),
3519  headers.toKernel(),
3520  (int *)result.getDevicePointer(),
3521  spq.point_size,
3522  n_slot)
3523  }
3524 
3534  template<unsigned int ... prp, typename S2, typename header_type>
3535  void unpack_with_headers(ExtPreAlloc<S2> & mem,
3536  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3537  header_type & headers,
3538  int ih,
3539  Unpack_stat & ps,
3540  mgpu::ofp_context_t &context,
3541  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3542  {
3544 
3545  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3546  {
3547  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3548 
3549  // readjust mem
3550  }
3551  else
3552  {
3553  // we have to increment ps by the right amount
3554  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3555  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3556 
3557  // First get the number of chunks
3558 
3559  size_t n_cnk = headers.template get<1>(ih);
3560  ps.addOffset(sizeof(size_t));
3561  ps.addOffset(2*dim*sizeof(unsigned int));
3562 
3563  size_t actual_offset = n_cnk*sizeof(indexT);
3564  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3565 
3566  // Unpack number of points
3567  // calculate the number of total points
3568  size_t n_pnt = headers.template get<2>(ih);
3569  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3570 
3571  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3572 
3573  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3574  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3575 
3576  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3577  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3578 
3579  scan_ptrs_cp.add(scan);
3580  offset_ptrs_cp.add(offsets);
3581  data_base_ptr_cp.add(data_base_ptr);
3582 
3583  ps.addOffset(actual_offset);
3584  }
3585  }
3586 
3592  static bool is_unpack_header_supported()
3593  {return true;}
3594 
3604  template<unsigned int ... prp, typename S2>
3605  void unpack(ExtPreAlloc<S2> & mem,
3606  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3607  Unpack_stat & ps,
3608  mgpu::ofp_context_t &context,
3609  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3610  {
3612 
3613  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3614  {
3615  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3616 
3617  // readjust mem
3618  }
3619  else
3620  {
3621  // we have to increment ps by the right amount
3622  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3623  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3624 
3625  // First get the number of chunks
3626 
3627  size_t n_cnk;
3628 
3629  // Unpack the number of chunks
3630  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
3631  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
3632 
3633  // Unpack origin of the chunk indexing
3634 /* for (int i = 0 ; i < dim ; i++)
3635  {
3636  int tmp;
3637  Unpacker<int,S2>::unpack(mem,tmp,ps);
3638  }
3639 
3640  for (int i = 0 ; i < dim ; i++)
3641  {
3642  int tmp;
3643  Unpacker<int,S2>::unpack(mem,tmp,ps);
3644  }*/
3645 
3646  ps.addOffset(2*dim*sizeof(unsigned int));
3647 
3648  size_t actual_offset = n_cnk*sizeof(indexT);
3649  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3650 
3651  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
3652  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
3653 
3654  // Unpack number of points
3655  // calculate the number of total points
3656  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
3657  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3658 
3659  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3660 
3661  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3662  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3663 
3664  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3665  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3666 
3667  scan_ptrs_cp.add(scan);
3668  offset_ptrs_cp.add(offsets);
3669  data_base_ptr_cp.add(data_base_ptr);
3670 
3671  ps.addOffset(actual_offset);
3672  }
3673  }
3674 
3680  {
3682  }
3683 
3685 
3691  decltype(self::type_of_iterator()) getIterator() const
3692  {
3693  return decltype(self::type_of_iterator())(*this);
3694  }
3695 
3701  decltype(self::type_of_subiterator()) getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop, int is_to_init = 1) const
3702  {
3703  return decltype(self::type_of_subiterator())(*this,start,stop,is_to_init);
3704  }
3705 
3711  auto private_get_add_index_array() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3712  {
3713  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index();
3714  }
3715 
3721  auto private_get_add_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3722  {
3723  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index();
3724  }
3725 
3731  auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer()) &
3732  {
3733  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
3734  }
3735 
3736  auto getSegmentToOutMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3737  {
3738  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap();
3739  }
3740 
3741  auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3742  {
3743  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap();
3744  }
3745 
3746  auto getSegmentToMergeIndexMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3747  {
3748  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap();
3749  }
3750 
3751  auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3752  {
3753  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap();
3754  }
3755 
3762  {
3764  }
3765 
3771  auto private_get_neighborhood_array() -> decltype(nn_blocks) &
3772  {
3773  return nn_blocks;
3774  }
3775 
3776 
3777 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)
3778 
3784  template<typename Tw = float> bool write(const std::string & output)
3785  {
3786  Point<dim,double> spacing;
3787  Point<dim,double> offset;
3788 
3789  spacing.one();
3790  offset.zero();
3791 
3792  return write_with_spacing_offset(output,spacing,offset);
3793  }
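 // Illustrative usage (editorial sketch, not part of the original source): write the sparse grid as
 // a point cloud in VTK format with unit spacing and zero offset. The data is read through the
 // host-side buffers, so it is assumed to have been moved to the host beforehand:
 //
 //   sparse_grid.write("sparse_grid_output");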
3794 
3800  template<typename Tw = float>
3801  bool write_with_spacing_offset(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3802  {
3803  file_type ft = file_type::BINARY;
3804 
3805  auto & bm = this->private_get_blockMap();
3806 
3807  auto & index = bm.getIndexBuffer();
3808  auto & data = bm.getDataBuffer();
3809 
3812 
3813  // copy position and properties
3814 
3815  auto it = index.getIterator();
3816 
3817  while(it.isNext())
3818  {
3819  auto key = it.get();
3820 
3821  Point<dim,Tw> p;
3822 
3823  for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
3824  {
3826  {
3827  // Get the block index
3828  grid_key_dx<dim,int> keyg = gridGeometry.InvLinId(index.template get<0>(key),i);
3829 
3830  for (size_t k = 0 ; k < dim ; k++)
3831  {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}
3832 
3833  tmp_pos.add(p);
3834 
3835  tmp_prp.add();
3836  copy_prop_to_vector_block<decltype(data.get_o(key)),decltype(tmp_prp.last())>
3837  cp(data.get_o(key),tmp_prp.last(),key,i);
3838 
3839  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);
3840 
3841  tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
3842  }
3843  }
3844 
3845  ++it;
3846  }
3847 
3848  // VTKWriter for a set of points
3850  vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
3851 
3852  openfpm::vector<std::string> prp_names;
3853 
3854  // Write the VTK file
3855  return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
3856  }
3857 
3863  template<typename Tw = float> bool write_debug(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3864  {
3866  VTKWriter<openfpm::vector<SpaceBox<dim, double>>, VECTOR_BOX> vtk_box1;
3867 
3869 
3870  auto & ids = private_get_index_array();
3871 
3872  fill_chunks_boxes(chunks_box,ids,spacing,offset);
3873 
3874  vtk_box1.add(chunks_box);
3875  vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));
3876 
3877  //write data
3878 
3879  write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
3880 
3881  return true;
3882  }
3883 
3884 #endif
3885 };
3886 
3887 template<unsigned int dim,
3888  typename AggregateT,
3889  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3890  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3891  typename indexT=long int,
3892  template<typename> class layout_base=memory_traits_inte,
3893  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3895 
3896 template<unsigned int dim,
3897  typename AggregateT,
3898  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3899  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3900  typename indexT=int,
3901  template<typename> class layout_base=memory_traits_inte,
3902  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3904 
3905 template<unsigned int dim,
3906  typename AggregateT,
3907  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3908  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3909  typename indexT=int,
3910  template<typename> class layout_base=memory_traits_inte,
3911  typename linearizer = grid_smb<dim, blockEdgeSize,indexT>>
3913 
3914 #endif //OPENFPM_PDATA_SPARSEGRIDGPU_HPP