#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
#define OPENFPM_PDATA_SPARSEGRIDGPU_HPP

constexpr int BLOCK_SIZE_STENCIL = 128;

#include "util/cuda_launch.hpp"
#include <SparseGridGpu/BlockMapGpu.hpp>
#include <Grid/iterators/grid_skin_iterator.hpp>
#include <Grid/Geometry/grid_smb.hpp>
#include "SparseGridGpu_ker.cuh"
#include "SparseGridGpu_kernels.cuh"
#include "Iterators/SparseGridGpu_iterator_sub.hpp"
#include "Grid/Geometry/grid_zmb.hpp"
#include "util/stat/common_statistics.hpp"
#include "Iterators/SparseGridGpu_iterator.hpp"
#include "Space/SpaceBox.hpp"

#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
#include "VTKWriter/VTKWriter.hpp"
#endif

constexpr int NO_ITERATOR_INIT = 0;

enum tag_boundaries
{
    NO_CALCULATE_EXISTING_POINTS,
    CALCULATE_EXISTING_POINTS
};
template<unsigned int dim>
    typedef boost::mpl::int_<2> type;

    typedef boost::mpl::int_<256> type;
    typedef boost::mpl::int_<256> tb;

    typedef boost::mpl::int_<16> type;
    typedef boost::mpl::int_<256> tb;

    typedef boost::mpl::int_<8> type;
    typedef boost::mpl::int_<512> tb;
template<typename T, unsigned int dim, unsigned int blockEdgeSize>

template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>

template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>

template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>

template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>

template<typename aggr>

template<typename ... types>
template<typename enc_type>
    :offset(offset),enc(enc)

    boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);

    template<unsigned int p>
    auto get() -> decltype(enc.template get<p>()[offset])
        return enc.template get<p>()[offset];

    template<unsigned int p>
    auto get() const -> decltype(enc.template get<p>()[offset])
        return enc.template get<p>()[offset];
enum StencilMode
{
    STENCIL_MODE_INPLACE = 1,
    STENCIL_MODE_INPLACE_NO_SHARED = 3
};
template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
    SGridGpu::device_grid_type::dims> type;

#include "encap_num.hpp"

template<typename SGridGpu>
template<unsigned int dim>
    __device__ inline static bool is_padding()
        printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);

    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
        bool isPadding_ = false;
        for (int i = 0 ; i < 3 ; i++)
        for (int j = 0 ; j < 3 ; j++)
        for (int k = 0 ; k < 3 ; k++)
            auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
            typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
            isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
            if (isPadding_) break;

    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
        bool isPadding_ = false;
        for (int i = 0 ; i < 3 ; i++)
        for (int j = 0 ; j < 3 ; j++)
            auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
            typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
            isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
            if (isPadding_) break;
template<unsigned int dim>
    template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
    __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
        blockMap_type & blockMap,
        SparseGrid_type & sparseGrid,
        const unsigned int offset)
        int neighbourPos = blockMap.size();
        if (offset < nNN && offset != nNN / 2)
            for (int i = 0 ; i < dim ; i++)
                blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
            neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;

    template<typename indexT, unsigned int blockEdgeSize, typename coordType>
    __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
        indexT neighbourNum = 0;
        for (int i = 0 ; i < dim ; i++)
            int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
            else if (c >= blockEdgeSize)
                neighbourNum += 2*accu;
                neighbourNum += accu;

    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])

    template<unsigned int blockEdgeSize, typename indexT2>
        for (unsigned int i = 0 ; i < dim ; i++)
            int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
            offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
            cnt_off *= blockEdgeSize;
template<unsigned int nNN_, unsigned int nLoop_>
    static const unsigned int nNN = nNN_;
    static const unsigned int nLoop = nLoop_;

template<typename copy_type>
    template<typename T, typename dst_type, typename src_type>
    static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
        dst.template get<T::value>() = src.template get<T::value>()[bPos];

template<typename copy_type, unsigned int N1>
    template<typename T, typename dst_type, typename src_type>
    static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
        for (int i = 0 ; i < N1 ; i++)
            dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];

template<typename Tsrc, typename Tdst>
    typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;

template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
    mutable size_t tot = 0;
    typedef typename boost::mpl::at<vprp,T>::type prp_cp;
    typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
    arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
    tot += sz * sizeof(pack_type);

template<typename SparseGridType>
    SparseGridType * grd;
    :grd(&grd),src(src),dst(dst)
template<unsigned int dim,
    typename indexT=long int,
    typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
    threadBlockSize, indexT, layout_base>
    static constexpr unsigned int dims = dim;

    threadBlockSize, indexT, layout_base> BMG;

    const static unsigned char PADDING_BIT = 1;
    linearizer gridGeometry;
    unsigned int stencilSupportRadius;
    unsigned int ghostLayerSize;
    mutable int index_size_swp_r = -1;
    inline void swap_internal_remote()
        n_cnk_cp_swp_r.swap(n_cnk_cp);
        n_pnt_cp_swp_r.swap(n_pnt_cp);
        n_shift_cp_swp_r.swap(n_shifts_cp);
        convert_blk_swp_r.swap(convert_blk);
        box_cp_swp_r.swap(box_cp);

    inline void swap_internal_local()
        offset_ptrs_cp_swp.swap(offset_ptrs_cp);
        scan_ptrs_cp_swp.swap(scan_ptrs_cp);
        data_base_ptr_cp_swp.swap(data_base_ptr_cp);
        n_cnk_cp_swp.swap(n_cnk_cp);
        n_pnt_cp_swp.swap(n_pnt_cp);
        n_shift_cp_swp.swap(n_shifts_cp);
        convert_blk_swp.swap(convert_blk);
        box_cp_swp.swap(box_cp);

    inline void swap_local_pack()
        index_ptrs_swp.swap(index_ptrs);
        scan_ptrs_swp.swap(scan_ptrs);
        data_ptrs_swp.swap(data_ptrs);
        offset_ptrs_swp.swap(offset_ptrs);
        mask_ptrs_swp.swap(mask_ptrs);

    inline void swap_remote_pack()
        index_ptrs_swp_r.swap(index_ptrs);
        scan_ptrs_swp_r.swap(scan_ptrs);
        data_ptrs_swp_r.swap(data_ptrs);
        offset_ptrs_swp_r.swap(offset_ptrs);
        mask_ptrs_swp_r.swap(mask_ptrs);
    static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
    typedef AggregateBlockT AggregateInternalT;
    static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
    typedef linearizer grid_info;
    typedef linearizer linearizer_type;
    typedef indexT indexT_;
    typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;

        return this->countExistingElements();

    template <typename stencil = no_stencil>

    template<typename dim3T>
    inline static int dim3SizeToInt(dim3T d)
        return d.x * d.y * d.z;

    inline static int dim3SizeToInt(size_t d)

    inline static int dim3SizeToInt(unsigned int d)

    template<typename ... v_reduce>
        ::template flush<v_reduce ...>(context, opt);
    void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
        if (is_unpack_remote == true)
        {swap_internal_remote();}

        if (is_unpack_remote == false)
        {swap_internal_local();}

    void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
        if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
        {swap_internal_remote();}

        if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
        {swap_internal_local();}

    void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
        if (is_pack_remote == false)
            req_index_swp = req_index;

        if (is_pack_remote == true)
            req_index_swp_r = req_index;

    void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
        if (opt & KEEP_GEOMETRY && is_pack_remote == false)
            req_index = req_index_swp;

        if (opt & KEEP_GEOMETRY && is_pack_remote == true)
            req_index = req_index_swp_r;
    template<unsigned int n_it>
    void calculatePackingPointsFromBoxes(int opt, size_t tot_pnt)
        if (!(opt & KEEP_GEOMETRY))
            ite.wthr.x = indexBuffer.size();
            ite.thr.x = getBlockSize();

            CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
                indexBuffer.toKernel(),
                dataBuffer.toKernel(),
    void computeSizeOfGhostLayer()
        unsigned int term1 = 1;
        for (int i = 0; i < dim; ++i)
            term1 *= blockEdgeSize + 2 * stencilSupportRadius;
        unsigned int term2 = 1;
        for (int i = 0; i < dim; ++i)
            term2 *= blockEdgeSize;
        ghostLayerSize = term1 - term2;
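    // Worked example (illustrative numbers, not from the original source): with dim = 3,
    // blockEdgeSize = 8 and stencilSupportRadius = 1 the enlarged block spans
    // (8 + 2*1)^3 = 1000 cells, the interior block 8^3 = 512 cells, so
    // ghostLayerSize = 1000 - 512 = 488 ghost cells per block.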
    void allocateGhostLayerMapping()

    template<typename stencil_type>
    void computeGhostLayerMapping()
        size_t dimensions[dim],
            innerDomainBegin[dim], innerDomainEnd[dim],
            outerBoxBegin[dim], outerBoxEnd[dim],
        for (int i = 0; i < dim; ++i)
            dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
            innerDomainBegin[i] = stencilSupportRadius - 1;
            innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
            outerBoxBegin[i] = origin[i];
            outerBoxEnd[i] = dimensions[i];
            bc[i] = NON_PERIODIC;

            auto coord = gsi.get();
            assert(i < ghostLayerSize);
            mem_id linId = enlargedGrid.LinId(coord);
            ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
            assert(i == ghostLayerSize);
    void initialize(const size_t (& res)[dim])
        gridGeometry = linearizer(res);

        computeSizeOfGhostLayer();
        allocateGhostLayerMapping();
        computeGhostLayerMapping<NNStar<dim>>();

        size_t extBlockDims[dim];
        for (int d=0; d<dim; ++d)
            extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
    template <typename stencil, typename... Args>
    void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer_.size() * dataChunkSize;

        if (numScalars == 0) return;

        constexpr unsigned int chunksPerBlock = 1;
        const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
        const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
            ? numScalars / localThreadBlockSize
            : 1 + numScalars / localThreadBlockSize;

#ifdef CUDIFY_USE_CUDA
        CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
            threadGridSize, localThreadBlockSize,
            indexBuffer_.toKernel(),
            dataBuffer_.toKernel(),
            this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),

        auto indexBuffer = indexBuffer_.toKernel();
        auto dataBuffer = dataBuffer_.toKernel();
        auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();

        auto lamb = [=] __device__ () mutable
            constexpr unsigned int pIndex = 0;

            typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
            typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;

            typedef typename decltype(dataBuffer)::value_type AggregateT_;
            typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
            typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
            constexpr unsigned int blockSize = MaskBlockT::size;

            const unsigned int dataBlockPos = blockIdx.x;
            const unsigned int offset = threadIdx.x;

            if (dataBlockPos >= indexBuffer.size())

            auto dataBlockLoad = dataBuffer.get(dataBlockPos);
            const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);

            unsigned char curMask;

            if (offset < blockSize)
                curMask = dataBlockLoad.template get<pMask>()[offset];
                for (int i = 0 ; i < dim ; i++)
                {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}

            sdataBlockPos.id = dataBlockPos;

            sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,

        CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
    template <typename stencil, typename... Args>
    void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer.size() * dataChunkSize;

        if (numScalars == 0) return;

        auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);

        CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
            indexBuffer.toKernel(),
            dataBuffer.toKernel(),
            this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
    template<typename ids_type>
        for (int i = 0 ; i < chunk_ids.size() ; i++)
            auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);

            for (int j = 0 ; j < dim ; j++)
                box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
                box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);

            chunks_box.add(box);
    template<typename MemType, unsigned int ... prp>
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)

        for (size_t i = 0 ; i < copySect.size() ; i++)
            auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);

    template<unsigned int ... prp>
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)

    template<unsigned int ... prp>
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)

        for (size_t i = 0 ; i < copySect.size() ; i++)
            auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
            this->packRequest(sub_it,req);

        for (size_t i = 0 ; i < copySect.size() ; i++)
            auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);

        size_t req = mem.size();
    template<unsigned int ... prp>
    void removeCopyToFinalize_phase3(gpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
            if (tmp2.size() == 0)

            auto & add_buff = this->blockMap.private_get_vct_add_index();
            add_buff.swap(tmp2);

            auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
            ite = nadd_buff.getGPUIterator();
            CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());

            this->template flush<sLeft_<prp>...>(ctx,flush_type::FLUSH_ON_DEVICE);

            auto & o_map = this->getSegmentToOutMap();
            auto & segments_data = this->getSegmentToMergeIndexMap();

            new_map.resize(a_map.size(),0);

            ite = segments_data.getGPUIterator();

            if (ite.nblocks() != 0)
                CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);

            convert_blk.template hostToDevice<0>();

        RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);

        size_t n_accu_cnk = 0;
        for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
            size_t n_pnt = n_pnt_cp.get(i);

            void * data_base_ptr = data_base_ptr_cp.get(i);
            data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);

            ite.wthr.x = n_cnk_cp.get(i);

            for (int j = 0 ; j < dim ; j++)
                if (l >= blockEdgeSize)
                {ite.thr.x *= blockEdgeSize;}

            if (ite.nblocks() != 0 && ite.thr.x != 0)
                AggregateT,
                decltype(convert_blk.toKernel()),
                decltype(new_map.toKernel()),
                decltype(data),
                decltype(chunks.toKernel()),prp... >),ite,
                (unsigned int *)scan_ptrs_cp.get(i),
                (unsigned short int *)offset_ptrs_cp.get(i),
                convert_blk.toKernel(),

            n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);

        saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
    template<unsigned int n_it, unsigned int ... prp>
        bool is_pack_remote)

        {std::cerr << __FILE__ << ":" << __LINE__ << " error the packing request number differ from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        for (size_t i = 0 ; i < pack_subs.size() ; i++)
            size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);

        for (size_t i = 0 ; i < pack_subs.size() ; i++)
            size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);

            index_ptr.ptr[i] = index_ptrs.get(i);
            scan_ptr.ptr[i] = scan_ptrs.get(i);

            data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);

            offset_ptr.ptr[i] = offset_ptrs.get(i);
            mask_ptr.ptr[i] = mask_ptrs.get(i);

        calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);

        for (int i = 0 ; i < n_it ; i++)
            for (int j = 0 ; j < sizeof...(prp) ; j++)
                arr_data->ptr[i][j] = data_ptr.ptr[i][j];

            decltype(indexBuffer.toKernel()),
            decltype(dataBuffer.toKernel()),
            decltype(tmp.toKernel()),
            dataBuffer.toKernel(),
            indexBuffer.toKernel(),

        {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),indexBuffer.size()+1,pack_subs.size());}
    template<unsigned int ... prp, typename S2>
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        for (int i = 0 ; i < dim ; i++)
            origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);

        for (int i = 0 ; i < dim ; i++)

        size_t actual_offset = n_cnk*sizeof(indexT);

        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));

            ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));

        size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));

        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));

        offset_ptrs_cp.add(offsets);
        scan_ptrs_cp.add(scan);
        n_cnk_cp.add(n_cnk);
        n_pnt_cp.add(n_pnt);
        data_base_ptr_cp.add(data_base_ptr);

        for (int i = 0 ; i < dim ; i++)

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));

        for (int i = 0 ; i < dim ; i++)
        {shifts.last().template get<0>()[i] = 0;}

        for (int i = 0 ; i < dim ; i++)
            int op_q = origPack_pnt.get(i) % blockEdgeSize;
            int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
            int quot = abs(ou_q - op_q) % blockEdgeSize;
            int squot = openfpm::math::sgn(ou_q - op_q);

            for (int j = 0 ; j < sz ; j++)
                for (int k = 0 ; k < dim ; k++)
                    shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);

        shifts.template hostToDevice<0>();

        linearizer gridGeoPack(sz);

        size_t sz[1] = {n_cnk};
        auto ite = g.getGPUIterator();

        for (int i = 0 ; i < dim ; i++)
            sz_g.set_d(i,gridGeometry.getSize()[i]);
            origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);

        n_shifts_cp.add(shifts.size());

        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
            gridGeoPack,origPack_cnk,
            gridGeometry,origUnpack_cnk,

        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
            gridGeoPack,origPack_cnk,
            gridGeometry,origUnpack_cnk,

        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
            gridGeoPack,origPack_cnk,
            gridGeometry,origUnpack_cnk,

        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
            gridGeoPack,origPack_cnk,
            gridGeometry,origUnpack_cnk,

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
    template<typename origPackType, typename IteratorType>
        for (int i = 0 ; i < dim ; i++)
            int op_q = origPack.get(i) % blockEdgeSize;
            int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
            int quot = abs(ou_q - op_q) % blockEdgeSize;
            quot_diff[i] = openfpm::math::sgn(ou_q - op_q)*quot;

        for (int j = 0 ; j < this->blockSize ; j++)
            for (int i = 0 ; i < dim ; i++)
                int c = x % blockEdgeSize;

                if (quot_diff[i] + c < 0)
                    offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
                else if (quot_diff[i] + c >= blockEdgeSize)
                    offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
                    offset += pos_c*(quot_diff[i] + c);

                pos_c *= blockEdgeSize;
                bp_c *= (quot_diff[i] != 0)?2:1;

            convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
    typedef AggregateT value_type;

    typedef self device_grid_type;

    :stencilSupportRadius(1)

    :stencilSupportRadius(stencilSupportRadius)

    : gridGeometry(gridGeometry),
      stencilSupportRadius(stencilSupportRadius)

        for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}

        typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
        decltype(extendedBlockGeometry),

        typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
        decltype(extendedBlockGeometry),

            extendedBlockGeometry,
            stencilSupportRadius,
            nn_blocks.toKernel(),

    template<unsigned int nNN, unsigned int nLoop>
        typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
        decltype(extendedBlockGeometry),

        typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
        decltype(extendedBlockGeometry),

            extendedBlockGeometry,
            stencilSupportRadius,
            nn_blocks.toKernel(),
        return gridGeometry;

    template<typename stencil_type>
        computeGhostLayerMapping<stencil_type>();

    constexpr static unsigned int getBlockEdgeSize()
        return blockEdgeSize;

    constexpr unsigned int getBlockSize() const

    template<typename CoordT>
    inline size_t getLinId(CoordT &coord)
        return gridGeometry.LinId(coord);

        return gridGeometry.InvLinId(linId);

        return gridSize.getGPUIterator(start,stop,n_thr);
    template<typename CoordT>
        auto glid = gridGeometry.LinId(coord);

        auto bid = glid / blockSize;
        auto lid = glid % blockSize;

        auto key = blockMap.get_sparse(bid);

        k.set_cnk_pos_id(key.id);

    template<unsigned int p, typename CoordT>

    template<unsigned int p>

    template<typename CoordT>
        gridGeometry.LinId(coord,lin,offset);

    template<unsigned int p>

    template<unsigned int p, typename CoordT>
    auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &

    template<typename CoordT>
        gridGeometry.LinId(coord,ind,offset);
        ite.wthr.x = indexBuffer.size();
        ite.thr.x = getBlockSize();

        output.resize(indexBuffer.size()+1);

        CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
            blockSize>),ite,this->toKernel(),output.toKernel(),db);

        openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);

        output.template deviceToHost<0>(output.size()-1,output.size()-1);
        unsigned int padding_points = output.template get<0>(output.size()-1);

        pd_points.resize(padding_points);

        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
            ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);

        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
            blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);

        ite.wthr.x = indexBuffer.size();
        ite.thr.x = getBlockSize();

        output.resize(indexBuffer.size()+1);

        CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
            blockSize>),ite,this->toKernel(),output.toKernel(),db);

        openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);

        output.template deviceToHost<0>(output.size()-1,output.size()-1);
        unsigned int padding_points = output.template get<0>(output.size()-1);

        pd_points.resize(padding_points);

        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
            ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);

        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
            blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
    template<typename dim3T>
            dim3SizeToInt(nBlock),
            dim3SizeToInt(nSlot)

    template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
    void tagBoundaries(gpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer.size() * dataChunkSize;

        if (numScalars == 0) return;
        if (findNN == false)
            findNeighbours<stencil_type>();

        unsigned int localThreadBlockSize = dataChunkSize;
        unsigned int threadGridSize = numScalars % dataChunkSize == 0
            ? numScalars / dataChunkSize
            : 1 + numScalars / dataChunkSize;

        if (stencilSupportRadius == 1)
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
        else if (stencilSupportRadius == 2)
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
        else if (stencilSupportRadius == 0)
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);

            std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;

        if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
            block_points.resize(indexBuffer.size() + 1);
            block_points.template get<0>(block_points.size()-1) = 0;
            block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);

            ite.wthr.x = indexBuffer.size();
            ite.thr.x = getBlockSize();

                dataBuffer.toKernel(),
                block_points.toKernel());

            openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),context);

            block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
            size_t tot = block_points.template get<0>(block_points.size()-1);

                dataBuffer.toKernel(),
                block_points.toKernel(),

        cudaDeviceSynchronize();
    template<typename NNtype = NNStar<dim>>
    void findNeighbours()
        const unsigned int numBlocks = indexBuffer.size();
        const unsigned int numScalars = numBlocks * NNtype::nNN;
        nn_blocks.resize(numScalars);

        if (numScalars == 0) return;

        unsigned int localThreadBlockSize = NNtype::nNN;

        unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
            ? numScalars / localThreadBlockSize
            : 1 + numScalars / localThreadBlockSize;

        CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
            threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
    size_t countExistingElements() const
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        size_t numExistingElements = 0;

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
            auto dataBlock = dataBuffer.get(blockId);
            for (size_t elementId=0; elementId<blockSize; ++elementId)
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask))
                    ++numExistingElements;

        return numExistingElements;

    size_t countBoundaryElements()
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        size_t numBoundaryElements = 0;

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
            auto dataBlock = dataBuffer.get(blockId);
            for (size_t elementId=0; elementId<blockSize; ++elementId)
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask) && this->isPadding(curMask))
                    ++numBoundaryElements;

        return numBoundaryElements;

    void measureBlockOccupancyMemory(double &mean, double &deviation)
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
            auto dataBlock = dataBuffer.get(blockId);
            size_t numElementsInBlock = 0;
            for (size_t elementId=0; elementId<blockSize; ++elementId)
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask))
                    ++numElementsInBlock;

            double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
            measures.add(blockOccupancy);

        standard_deviation(measures, mean, deviation);

    void measureBlockOccupancy(double &mean, double &deviation)
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
            auto dataBlock = dataBuffer.get(blockId);
            size_t numElementsInBlock = 0;
            for (size_t elementId=0; elementId<blockSize; ++elementId)
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask) && !this->isPadding(curMask))
                    ++numElementsInBlock;

            double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
            measures.add(blockOccupancy);

        standard_deviation(measures, mean, deviation);
    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

    template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

    template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3,
        unsigned int prop_dst1, unsigned int prop_dst2, unsigned int prop_dst3,
        unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

    template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
        for (int i = 0 ; i < dim ; i++)

        applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);

        for (int i = 0 ; i < dim ; i++)
            b.setHigh(i,gridGeometry.getSize()[i]);
    template<typename stencil, typename... Args>
    void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
        if (findNN == false)
            findNeighbours<typename stencil::stencil_type>();

        case STENCIL_MODE_INPLACE:
            applyStencilInPlace<stencil>(box,mode,args...);
        case STENCIL_MODE_INPLACE_NO_SHARED:
            applyStencilInPlaceNoShared<stencil>(box,mode,args...);

    template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
    void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
        applyStencils<stencil1>(box,mode, args...);
        applyStencils<stencil2, otherStencils ...>(box,mode, args...);
    template<typename BitMaskT>
    inline static bool isPadding(BitMaskT &bitMask)

    template<typename BitMaskT>
    inline static void setPadding(BitMaskT &bitMask)

    template<typename BitMaskT>
    inline static void unsetPadding(BitMaskT &bitMask)

    template<typename CoordT>
        return gridGeometry.BlockLinId(blockCoord);

    template<unsigned int p>
        indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
        indexT local_id = coord.get_data_id();

        block_data.template get<BMG::pMask>()[local_id] = 1;

        return block_data.template get<p>()[local_id];

    template<typename CoordT>
        auto lin = gridGeometry.LinId(coord);
        indexT block_id = lin / blockSize;
        local_id = lin % blockSize;

        block_data.template get<BMG::pMask>()[local_id] = 1;

    template<unsigned int p, typename CoordT>
        auto lin = gridGeometry.LinId(coord);
        indexT block_id = lin / blockSize;
        indexT local_id = lin % blockSize;

        block_data.template get<BMG::pMask>()[local_id] = 1;

        return block_data.template get<p>()[local_id];
    template<unsigned int p>
    void print_vct_add_data()
        threadBlockSize, indexT, layout_base> BMG;

        auto & bM = BMG::blockMap.private_get_vct_add_data();
        auto & vI = BMG::blockMap.private_get_vct_add_index();
        bM.template deviceToHost<p>();
        vI.template deviceToHost<0>();

        std::cout << "vct_add_data: " << std::endl;

        for (size_t i = 0 ; i < bM.size() ; i++)
            std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
            for (size_t j = 0 ; j < blockSize ; j++)
                std::cout << (int)bM.template get<p>(i)[j] << " ";

            std::cout << std::endl;
    template<unsigned int p>
    void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
        BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);

    static bool packRequest()

    template<int ... prp>
    inline
        indexBuffer.template packRequest<prp ...>(req);
        dataBuffer.template packRequest<prp ...>(req);

        indexBuffer.template pack<prp ...>(mem,sts);
        dataBuffer.template pack<prp ...>(mem,sts);

        indexBuffer.template unpack<prp ...>(mem,ps);
        dataBuffer.template unpack<prp ...>(mem,ps);

        if (mem.size() != 0)
        {std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
    template<int ... prp>
    inline
        ite.wthr.x = indexBuffer.size();
        ite.thr.x = getBlockSize();

        tmp.resize(indexBuffer.size() + 1);

            dataBuffer.toKernel(),

        openfpm::scan((indexT *)tmp.template getDeviceBuffer<0>(),
            tmp.size(), (indexT *)tmp.template getDeviceBuffer<0>(), context);

        tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);

        size_t n_pnt = tmp.template get<0>(tmp.size()-1);

        req = sizeof(indexT) +
            sizeof(indexT)*indexBuffer.size() +
            sizeof(indexT)*tmp.size() +
            n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char));
    template<int ... prp>
    inline
        for (int i = 0 ; i < dim ; i++)

        offset_ptrs.clear();

    template<int ... prp>
    inline
        ite.wthr.x = indexBuffer.size();
        ite.thr.x = getBlockSize();

        if (indexBuffer.size() != 0)
            CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                indexBuffer.toKernel(),
                dataBuffer.toKernel(),
                indexBuffer.size() + 1);

            CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                indexBuffer.toKernel(),
                dataBuffer.toKernel(),
                indexBuffer.size() + 1);

            CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                indexBuffer.toKernel(),
                dataBuffer.toKernel(),
                indexBuffer.size() + 1);

            CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                indexBuffer.toKernel(),
                dataBuffer.toKernel(),
                indexBuffer.size() + 1);

            std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        for (size_t i = 0 ; i < pack_subs.size() ; i++)
            tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
            tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;

            tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
            tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);

            openfpm::scan(((indexT *)tmp.template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
                indexBuffer.size() + 1, (indexT *)tmp.template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), context);

            openfpm::scan(((unsigned int *)tmp.template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
                indexBuffer.size() + 1, (unsigned int *)tmp.template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), context);

            tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
            tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);

            scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);

            n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
            n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);

            req += sizeof(size_t) +
                sizeof(indexT)*n_cnk +
                align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) +
                align_number(sizeof(indexT),n_pnt*(spq.point_size)) +
                align_number(sizeof(indexT),n_pnt*sizeof(short int)) +
                align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));

        scan_it.template hostToDevice<0>();

        openfpm::scan((indexT *)scan_it.template getDeviceBuffer<0>(),
            scan_it.size(), (indexT *)scan_it.template getDeviceBuffer<0>(), context);
        return this->blockMap.getMappingVector();

        return this->blockMap.getMergeIndexMapVector();

        unsigned int i = req_index;

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
        size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);

        for (int i = 0 ; i < dim ; i++)

        for (int i = 0 ; i < dim ; i++)

        mem.allocate(n_cnk*sizeof(indexT));

        mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );

        mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );

        mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );

        mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
    template<unsigned int ... prp>
        if ((opt & 0x3) == rem_copy_opt::PHASE1)
            this->template removeCopyToFinalize_phase1<prp ...>(ctx,opt);
        else if ((opt & 0x3) == rem_copy_opt::PHASE2)
            this->template removeCopyToFinalize_phase2<prp ...>(ctx,opt);
            this->template removeCopyToFinalize_phase3<prp ...>(ctx,opt,false);

        bool is_pack_remote = false)

        RestorePackVariableIfKeepGeometry(opt,is_pack_remote);

        pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);

        pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);

        pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);

        std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;

        savePackVariableIfNotKeepGeometry(opt,is_pack_remote);

        auto & vad = BMG::blockMap.private_get_vct_add_data();
        auto & vai = BMG::blockMap.private_get_vct_add_index();

        offset_ptrs_cp.clear();
        scan_ptrs_cp.clear();
        data_base_ptr_cp.clear();
        n_shifts_cp.clear();
        convert_blk.clear();
        gridGeometry.swap(gr.gridGeometry);

        if (rem_sects.size() != 0)
            rem_sects.template hostToDevice<0,1>();

            tmp.resize(indexBuffer.size() + 1);

            tmp.template get<1>(tmp.size()-1) = 0;
            tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);

            auto ite = indexBuffer.getGPUIterator();

            if (has_work_gpu(ite) == true)
                CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
                    blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
                    gridGeometry,dataBuffer.toKernel(),

            openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),context);

            tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);

            size_t nr_cnk = tmp.template get<1>(tmp.size()-1);

            tmp3.resize(nr_cnk);

            ite = indexBuffer.getGPUIterator();

            if (has_work_gpu(ite) == false) {return;}

            CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());

            ite = tmp3.getGPUIterator();

            ite.wthr.x = tmp3.size();
            ite.thr.x = getBlockSize();

            if (has_work_gpu(ite) == false) {return;}

            CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
                ite,indexBuffer.toKernel(),
                dataBuffer.toKernel(),
                rem_sects.toKernel());
    template<unsigned int ... prp>
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
            removeCopyToFinalize_phase3<prp ...>(context,opt,true);

        offset_ptrs_cp.clear();
        scan_ptrs_cp.clear();
        data_base_ptr_cp.clear();
        n_shifts_cp.clear();
        convert_blk.clear();
        data_base_ptr_cp.clear();

        rem_sects.add(section_to_delete);

        grid_src.copySect.add(sgs);
    template<typename pointers_type,
        typename headers_type,
        typename result_type,
        unsigned int ... prp >
    static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        result.allocate(sizeof(int));

        if (pointers.size())
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
                pointers.toKernel(),
                (int *)result.getDevicePointer(),
    template<unsigned int ... prp, typename S2, typename header_type>
        header_type & headers,
        rem_copy_opt opt = rem_copy_opt::NONE_OPT)

        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        size_t n_cnk = headers.template get<1>(ih);

        ps.addOffset(2*dim*sizeof(unsigned int));

        size_t actual_offset = n_cnk*sizeof(indexT);
        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));

        size_t n_pnt = headers.template get<2>(ih);
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));

        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));

        scan_ptrs_cp.add(scan);
        offset_ptrs_cp.add(offsets);
        data_base_ptr_cp.add(data_base_ptr);
    template<unsigned int ... prp, typename S2>
        rem_copy_opt opt = rem_copy_opt::NONE_OPT)

        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        ps.addOffset(2*dim*sizeof(unsigned int));

        size_t actual_offset = n_cnk*sizeof(indexT);
        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));

            ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));

        size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));

        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));

        scan_ptrs_cp.add(scan);
        offset_ptrs_cp.add(offsets);
        data_base_ptr_cp.add(data_base_ptr);

    auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())

    auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)

    template<typename Tw = float>
    bool write(const std::string & output)
        return write_with_spacing_offset(output,spacing,offset);

    template<typename Tw = float>
        file_type ft = file_type::BINARY;

        auto & index = bm.getIndexBuffer();
        auto & data = bm.getDataBuffer();

        auto it = index.getIterator();
            auto key = it.get();

            for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
                for (size_t k = 0 ; k < dim ; k++)
                {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}

                cp(data.get_o(key),tmp_prp.last(),key,i);

                boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);

                tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];

        vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());

        return vtk_writer.write(output,prp_names,"sparse_grid","",ft);

        fill_chunks_boxes(chunks_box,ids,spacing,offset);

        vtk_box1.add(chunks_box);
        vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));

        write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
template<unsigned int dim,
    typename AggregateT,
    typename indexT=long int,

template<unsigned int dim,
    typename AggregateT,
    typename indexT=int,

template<unsigned int dim,
    typename AggregateT,
    typename indexT=int,
void removeUnusedBuffers()
Eliminate many internal temporary buffers; you can use this between flushes if you get some out of memory error.
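A minimal usage sketch (the grid name `sgrid`, the reduction functor and the context are illustrative assumptions): call it between flush rounds to release scratch memory.
    sgrid.template flush<smax_<0>>(context, flush_type::FLUSH_ON_DEVICE);
    sgrid.removeUnusedBuffers();   // free internal temporaries before the next insert/flush round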
auto insert_o(unsigned int linId) -> decltype(blockMap.insert(0))
insert data, host version
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId, is_new).template get< p >())
insert a block + flush, host version
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
decltype(blockMap) & private_get_blockMap()
Return internal structure block map.
This class represents an N-dimensional box.
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
__device__ __host__ T getHigh(int i) const
get the high interval of the box
__device__ __host__ void setHigh(int i, T val)
set the high interval of the box
__device__ __host__ void setLow(int i, T val)
set the low interval of the box
virtual void * getDevicePointer()
get a readable pointer with the data
virtual bool resize(size_t sz)
resize the memory allocated
virtual void hostToDevice()
Move memory from host to device.
virtual size_t size() const
the size of the allocated memory
virtual void * getPointer()
get a readable pointer with the data
virtual bool allocate(size_t sz)
allocate memory
virtual void decRef()
Decrement the reference counter.
size_t getOffsetEnd()
Get offset.
size_t getOffset()
Get offset.
virtual void * getDevicePointer()
Return the pointer of the last allocation.
virtual void incRef()
Increment the reference counter.
virtual void * getPointer()
Return the pointer of the last allocation.
virtual void deviceToHost()
Do nothing.
void reset()
Reset the internal counters.
virtual void hostToDevice()
Return the pointer of the last allocation.
virtual bool allocate(size_t sz)
Allocate a chunk of memory.
virtual size_t size() const
Get the size of the LAST allocated memory.
This class allocates and destroys CPU memory.
static void pack(ExtPreAlloc< Mem >, const T &obj)
Error, no implementation.
This class implements the point shape in an N-dimensional space.
void one()
Set to one the point coordinate.
__device__ __host__ void zero()
Set to zero the point coordinate.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
This class represents an N-dimensional box.
grid_key_dx< dim > getStart() const
Return the starting point.
grid_key_dx< dim > getStop() const
Return the stop point.
__device__ unsigned int size(unsigned int i)
Size of the sparse grid in each direction.
constexpr __device__ unsigned int getBlockSize() const
Return the size of the block.
openfpm::vector_gpu< aggregate< int, short int > > link_dw
links of the padding points with real points of a finer sparsegrid
openfpm::vector_gpu< aggregate< size_t > > links_up
links of the padding points with real points of a coarse sparsegrid
SparseGridGpu(linearizer &gridGeometry, unsigned int stencilSupportRadius=1)
Constructor from block geometry.
size_t getBlockLinId(const CoordT &blockCoord) const
Linearization of block coordinates.
openfpm::vector_gpu< aggregate< indexT > > tmp3
temporary buffer 3
auto private_get_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer()) &
Return the index array of the blocks.
void conv_cross(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a convolution using a cross-like stencil.
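A hedged invocation sketch (property indices, the grid name `sgrid` and the lambda signature and semantics are assumptions for illustration; only the template parameters <prop_src,prop_dst,stencil_size> and the start/stop keys come from the header):
    grid_key_dx<3> start({1,1,1});
    grid_key_dx<3> stop({sz[0]-2,sz[1]-2,sz[2]-2});
    // read property 0, write the result of the cross stencil into property 1
    sgrid.template conv_cross<0,1,1>(start,stop,
        [] __device__ (float & u, auto & s)   // s carries the 2*dim cross neighbours (layout assumed)
        {
            return u;                         // return the value to store in prop_dst
        });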
openfpm::vector_gpu< aggregate< int[dim]> > shifts
shifts for chunk conversion
auto get(const grid_key_dx< dim, CoordT > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using the point coordinates.
openfpm::vector_gpu< aggregate< int, short int > > & getUpLinks()
Get the links up for each point.
void pack(ExtPreAlloc< CudaMemory > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Pack_stat &sts)
Pack the object into the memory given an iterator.
auto get_o(const grid_key_dx< dim, CoordT > &coord) const -> encap_data_block< typename std::remove_const< decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::get(0))>::type >
Get an element using the point coordinates.
auto get(const sparse_grid_gpu_index< self > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists)
SparseGridGpu(const size_t(&res)[dim], unsigned int stencilSupportRadius=1)
Constructor from block geometry.
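A minimal construction sketch (the dimensionality, aggregate layout and sizes are illustrative assumptions):
    size_t sz[2] = {512,512};
    SparseGridGpu<2, aggregate<float,float>> sgrid(sz);
    sgrid.template setBackgroundValue<0>(0.0f);   // value returned for non-existing points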
auto private_get_add_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the blocks.
void removeCopyToFinalize(gpu::ofp_context_t &ctx, int opt)
It finalizes the queued remove() and copy_to() operations.
ExtPreAlloc< CudaMemory > * prAlloc_prp
Memory to remove copy finalize.
void remove(const Box< dim, int > §ion_to_delete)
Remove all the points in this region.
openfpm::vector_gpu< aggregate< indexT > > e_points
void removePoints(gpu::ofp_context_t &context)
Remove the points we queued for removal.
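A short removal sketch (box extents, the grid name `sgrid` and the `context` object are assumptions):
    Box<2,int> to_delete({0,0},{63,63});
    sgrid.remove(to_delete);          // only queues the region for removal
    sgrid.removePoints(context);      // actually deletes the queued points on the GPU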
void unpack(ExtPreAlloc< CudaMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
void construct_link(self &grid_up, self &grid_dw, gpu::ofp_context_t &context)
construct link between levels
void unpack_with_headers(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, header_type &headers, int ih, Unpack_stat &ps, gpu::ofp_context_t &context, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
auto private_get_neighborhood_array() -> decltype(nn_blocks) &
Return the index array of the blocks.
void setNNType()
Set the neighborhood type.
openfpm::vector_gpu< aggregate< unsigned int > > pack_output
Helper array to pack points.
auto get(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists)
openfpm::vector_gpu< aggregate< indexT > > tmp2
temporary buffer 2
auto insertFlush(const grid_key_dx< dim, CoordT > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
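Host-side insertion sketch (the key value, property index and grid name are illustrative):
    grid_key_dx<2> key({10,10});
    sgrid.template insertFlush<0>(key) = 1.0f;   // insert the point and flush it directly on the host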
void packRequest(size_t &req) const
Asking to pack a SparseGridGpu without a GPU context packs the grid on the CPU into host memory.
auto private_get_data_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer())
Return the data array of the blocks.
auto insertFlush(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
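A rough sketch of the device-side insert cycle (the kernel `insert_kernel`, the block/slot sizes and the reduction functor are assumptions, not part of this header): size the insert buffer, insert from a kernel through toKernel(), then flush with a merge functor.
    sgrid.setGPUInsertBuffer(nBlock,nSlot);                         // reserve nSlot insert slots per block
    CUDA_LAUNCH_DIM3(insert_kernel,nBlock,nSlot,sgrid.toKernel());  // user kernel calling insert on the kernel view
    sgrid.template flush<smax_<0>>(context,flush_type::FLUSH_ON_DEVICE);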
void packCalculate(size_t &req, gpu::ofp_context_t &context)
Calculate the size of the information to pack.
auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
Return the mapping vector used to know where the data has been added.
static bool is_unpack_header_supported()
Indicate that unpacking the header is supported.
openfpm::vector_gpu< aggregate< short int, short int > > ghostLayerToThreadsMapping
void copyRemoveReset()
Reset the queue to remove and copy section of grids.
openfpm::vector_gpu< aggregate< int, short int > > & getDownLinks()
Get the links down for each point.
unsigned char getFlag(const sparse_grid_gpu_index< self > &coord) const
Return the flag of the point.
void removeAddUnpackReset()
In this case it does nothing.
static SparseGridGpu_iterator_sub< dim, self > type_of_subiterator()
This is a meta-function returning which type of sub-iterator a grid produces.
size_t size() const
return the size of the grid
void packRequest(SparseGridGpu_iterator_sub< dim, self > &sub_it, size_t &req) const
Calculate the size to pack part of this structure.
openfpm::vector_gpu< aggregate< unsigned int > > & getDownLinksOffsets()
Get the offsets for each point of the links down.
void pack(ExtPreAlloc< HeapMemory > &mem, Pack_stat &sts) const
Pack the object into the memory.
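A hedged sketch of the host-side pack path, assuming the usual OpenFPM HeapMemory/ExtPreAlloc/Pack_stat pattern:

    size_t req = 0;
    grid.packRequest(req);                  // size needed to pack on CPU/host memory
    HeapMemory pmem;
    pmem.allocate(req);
    ExtPreAlloc<HeapMemory> mem(req, pmem);
    Pack_stat sts;
    grid.pack(mem, sts);                    // serialize the grid into mem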
openfpm::vector_gpu< aggregate< unsigned int > > & getUpLinksOffsets()
Get the offsets for each point of the links up.
static constexpr bool isCompressed()
This is a multiresolution sparse grid, so it is a compressed format.
void removeAddUnpackFinalize(gpu::ofp_context_t &context, int opt)
This function removes the points queued for removal and flushes all the added/unpacked data.
auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
Return the mapping vector used to know where the data has been added.
decltype(self::type_of_iterator()) getIterator() const
Return a SparseGrid iterator.
void packRequest(size_t &req, gpu::ofp_context_t &context) const
memory requested to pack this object
void copy_to(self &grid_src, const Box< dim, size_t > &box_src, const Box< dim, size_t > &box_dst)
It queues a copy.
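A hedged sketch of the queued-copy workflow between two grids of the same type; context is assumed to be an existing gpu::ofp_context_t and opt = 0 an assumed default:

    Box<2, size_t> box_src({0, 0}, {63, 63});
    Box<2, size_t> box_dst({64, 64}, {127, 127});
    grid_dst.copy_to(grid_src, box_src, box_dst);   // only queues the copy
    grid_dst.removeCopyToFinalize(context, 0);      // executes the queued remove()/copy_to()
    grid_dst.copyRemoveReset();                     // reset the remove/copy queues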
static SparseGridGpu_iterator< dim, self > type_of_iterator()
This is a meta-function returning which type of iterator a grid produces.
void resize(size_t(&res)[dim])
resize the SparseGrid
void construct_link_dw(self &grid_dw, const Box< dim, int > &db_, Point< dim, int > p_dw, gpu::ofp_context_t &context)
construct link on the down level
openfpm::vector_gpu< aggregate< int, short int > > link_up
links of the padding points with real points of a finer sparsegrid
void conv2_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
void conv_cross_b(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< indexT, unsigned int > > tmp
temporary buffer
void packReset()
Reset the pack calculation.
bool isSkipLabellingPossible()
This function checks whether keeping the geometry is possible for this grid.
void construct_link_up(self &grid_up, const Box< dim, int > &db_, Point< dim, int > p_up, gpu::ofp_context_t &context)
construct link on the up levels
openfpm::vector_gpu< Box< dim, int > > pack_subs
the set of all sub-sets to pack
auto private_get_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer())
Return the index array of the blocks.
void addAndConvertPackedChunkToTmp(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &context)
unpack the sub-grid object
auto insertBlockFlush(const grid_key_dx< dim, CoordT > &coord, indexT &local_id) -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::insertBlockFlush(0))
Insert the point on host side and flush directly.
void unpack(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &context, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
auto private_get_data_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer()) &
Return the data array of the blocks.
auto get_o(const sparse_grid_gpu_index< self > &coord) const -> encap_data_block< typename std::remove_const< decltype(private_get_data_array().get(0))>::type >
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists).
static void unpack_headers(pointers_type &pointers, headers_type &headers, result_type &result, int n_slot)
Stub does not do anything.
void conv3_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< int > > new_map
Map between the (last) added chunks and their position in the chunks data.
int yes_i_am_grid
it defines that this data-structure is a grid
void setBackgroundValue(typename boost::mpl::at< typename AggregateT::type, boost::mpl::int_< p > >::type backgroundValue)
set the background for property p
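A hedged one-liner (property index and value are illustrative); the background value is what non-existing points are assumed to carry:

    grid.template setBackgroundValue<0>(0.0f);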
void convertChunkIds(short int *offset, origPackType &origPack, IteratorType &sub_it)
convert the offset index from the packed to the add buffer
void unpack(ExtPreAlloc< HeapMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
void conv2(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< unsigned int > > link_dw_scan
scan offsets of the links down
Box< dim, int > getBox()
Return a Box with the range of the SparseGrid.
base_key get_sparse(const grid_key_dx< dim, CoordT > &coord) const
Get an element using the point coordinates.
void removeUnusedBuffers()
Eliminate many internal temporary buffers; you can use this between flushes if you get some out-of-memo...
decltype(self::type_of_subiterator()) getIterator(const grid_key_dx< dim > &start, const grid_key_dx< dim > &stop, int is_to_init=1) const
Return a SparseGrid iterator only on a sub-set of elements.
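A hedged host-side iteration sketch over a sub-set of the existing points, using the sparse_grid_gpu_index returned by the iterator (start/stop are illustrative; the iterator is assumed to follow the usual OpenFPM isNext()/get()/operator++ convention):

    grid_key_dx<2> start(0, 0);
    grid_key_dx<2> stop(127, 127);
    auto it = grid.getIterator(start, stop);
    while (it.isNext())
    {
        auto p = it.get();                 // sparse_grid_gpu_index of an existing point
        grid.template get<0>(p) += 1.0f;   // read/write property 0
        ++it;
    }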
openfpm::vector_gpu< aggregate< unsigned int > > link_up_scan
scan offsets of the links up
linearizer & getGrid()
Return the grid information object.
auto private_get_add_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the add-index array of the blocks.
openfpm::vector_gpu< aggregate< indexT > > scan_it
contains the scan of the points for each iterator
void packFinalize(ExtPreAlloc< CudaMemory > &mem, Pack_stat &sts, int opt=0, bool is_pack_remote=false)
Finalize the packing procedure.
void conv(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
size_t getOffset()
Return the actual counter.
void addOffset(size_t off)
Increment the offset pointer by off.
static void unpack(ExtPreAlloc< Mem >, T &obj)
Error, no implementation.
void operator()(T &t) const
It call the copy function for each property.
void operator()(T &t) const
It call the copy function for each property.
void * base_ptr
data pointers
grid_key_dx is the key to access any element in the grid
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
__device__ __host__ index_type get(index_type i) const
Get the i index.
void setDimensions(const size_t(&dims)[N])
Reset the dimension of the grid.
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const signed char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
Implementation of 1-D std::vector like structure.
An element index contains a data chunk index and a point index.
int get_cnk_pos_id() const
Get chunk position id.
int get_data_id() const
Get the chunk-local index (the returned index is < getblockSize()).
get the type of the insertBlock
get the type of the block
static __device__ bool getNNindex_offset(grid_key_dx< dim, indexT2 > &coord, unsigned int &NN_index, unsigned int &offset_nn)
given a coordinate written in local coordinates, it returns the neighborhood chunk position ...
aggregate of properties; from a list of objects it creates a struct that follows the OpenFPM native stru...
this class is a functor for "for_each" algorithm
Transform the boost::fusion::vector into memory specification (memory_traits)
this class is a functor for "for_each" algorithm