#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
#define OPENFPM_PDATA_SPARSEGRIDGPU_HPP

constexpr int BLOCK_SIZE_STENCIL = 128;

#include "util/cuda_util.hpp"

#include <SparseGridGpu/BlockMapGpu.hpp>
#include <Grid/iterators/grid_skin_iterator.hpp>
#include <Grid/Geometry/grid_smb.hpp>
#include "SparseGridGpu_ker.cuh"
#include "SparseGridGpu_kernels.cuh"
#include "Iterators/SparseGridGpu_iterator_sub.hpp"
#include "Grid/Geometry/grid_zmb.hpp"
#include "util/stat/common_statistics.hpp"
#include "Iterators/SparseGridGpu_iterator.hpp"
#include "Space/Shape/Box.hpp"

#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
#include "VTKWriter/VTKWriter.hpp"
#endif

constexpr int NO_ITERATOR_INIT = 0;

enum tag_boundaries
{
	NO_CALCULATE_EXISTING_POINTS,
	CALCULATE_EXISTING_POINTS
};
template<unsigned int dim>
struct default_edge
{
	typedef boost::mpl::int_<2> type;
};

template<>
struct default_edge<1>
{
	typedef boost::mpl::int_<256> type;
	typedef boost::mpl::int_<256> tb;
};

template<>
struct default_edge<2>
{
	typedef boost::mpl::int_<16> type;
	typedef boost::mpl::int_<256> tb;
};

template<>
struct default_edge<3>
{
	typedef boost::mpl::int_<8> type;
	typedef boost::mpl::int_<512> tb;
};
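// Illustrative reading of the values above: a 1D grid defaults to blocks of 256
// elements, a 2D grid to 16x16 blocks and a 3D grid to 8x8x8 blocks, so the data
// chunk handled per block stays in the 256-512 element range (the tb value).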
template<typename T, unsigned int dim, unsigned int blockEdgeSize>

template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>

template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>

template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>

template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>

template<typename aggr>

template<typename ... types>
template<typename enc_type>

	:offset(offset),enc(enc)

	boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);

	template<unsigned int p>
	auto get() -> decltype(enc.template get<p>()[offset])
	{
		return enc.template get<p>()[offset];
	}

	template<unsigned int p>
	auto get() const -> decltype(enc.template get<p>()[offset])
	{
		return enc.template get<p>()[offset];
	}
enum StencilMode
{
	STENCIL_MODE_INPLACE = 1,
	STENCIL_MODE_INPLACE_NO_SHARED = 3
};
template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>

		SGridGpu::device_grid_type::dims> type;

#include "encap_num.hpp"
template<typename SGridGpu>
template<unsigned int dim>
struct NNfull_is_padding_impl
{
	__device__ inline static bool is_padding()
	{
		printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
	}
};
	template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
	__device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
	{
		bool isPadding_ = false;
		for (int i = 0 ; i < 3 ; i++)
		{
			for (int j = 0 ; j < 3 ; j++)
			{
				for (int k = 0 ; k < 3 ; k++)
				{
					auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
					typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
					isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
					if (isPadding_) break;
				}
			}
		}
	template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
	__device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
	{
		bool isPadding_ = false;
		for (int i = 0 ; i < 3 ; i++)
		{
			for (int j = 0 ; j < 3 ; j++)
			{
				auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
				typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
				isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
				if (isPadding_) break;
			}
		}
template<unsigned int dim>
struct NNStar
{
	template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
	__device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
			blockMap_type & blockMap,
			SparseGrid_type & sparseGrid,
			const unsigned int offset)
	{
		int neighbourPos = blockMap.size();
		if (offset < nNN && offset != nNN / 2)
		{
			for (int i = 0 ; i < dim ; i++)
			{
				blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
			}

			neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
		}
	template<typename indexT, unsigned int blockEdgeSize, typename coordType>
	__host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
	{
		indexT neighbourNum = 0;

		for(int i = 0 ; i < dim ; i++)
		{
			int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);

			else if (c >= blockEdgeSize)
			{
				neighbourNum += 2*accu;
			}

				neighbourNum += accu;
	template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
	__device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
	template<unsigned int blockEdgeSize, typename indexT2>

		for (unsigned int i = 0 ; i < dim ; i++)
		{
			int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));

			offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;

			cnt_off *= blockEdgeSize;
		}
template<unsigned int nNN_, unsigned int nLoop_>

	static const unsigned int nNN = nNN_;
	static const unsigned int nLoop = nLoop_;

template<typename copy_type>

	template<typename T, typename dst_type, typename src_type>
	static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
	{
		dst.template get<T::value>() = src.template get<T::value>()[bPos];
	}
template<typename copy_type, unsigned int N1>

	template<typename T, typename dst_type, typename src_type>
	static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
	{
		for (int i = 0 ; i < N1 ; i++)
		{
			dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
		}
	}
template<typename Tsrc, typename Tdst>

		typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;

template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
struct data_ptr_fill
{
	mutable size_t tot = 0;

		typedef typename boost::mpl::at<vprp,T>::type prp_cp;

		typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

		arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));

		tot += sz * sizeof(pack_type);
template<typename SparseGridType>

	SparseGridType * grd;

	:grd(&grd),src(src),dst(dst)
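/* Usage sketch (illustrative only): the property index, grid sizes, reduction
 * functor and the insertion kernel below are placeholders, not code from this
 * file. A typical host-side workflow for a 3D grid of floats looks like:
 *
 *   size_t sz[3] = {512,512,512};
 *   SparseGridGpu<3, aggregate<float>> grid(sz);
 *   grid.template setBackgroundValue<0>(0.0f);
 *
 *   grid.setGPUInsertBuffer(nBlocks, nSlotsPerBlock);             // reserve insert slots
 *   // CUDA_LAUNCH(my_insert_kernel, ite, grid.toKernel(), ...);  // hypothetical kernel
 *   grid.template flush<smax_<0>>(gpuContext, flush_type::FLUSH_ON_DEVICE);  // smax_: assumed merge functor
 *
 *   grid.template findNeighbours<NNStar<3>>();                    // build block neighbourhood
 *   grid.tagBoundaries(gpuContext);                               // mark padding points
 */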
template<unsigned int dim,
         typename AggregateT,
         unsigned int blockEdgeSize = default_edge<dim>::type::value,
         unsigned int threadBlockSize = default_edge<dim>::tb::value,
         typename indexT=long int,
         template<typename> class layout_base=memory_traits_inte,
         typename linearizer = grid_smb<dim, blockEdgeSize, indexT>>
class SparseGridGpu : public BlockMapGpu<
        typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
        threadBlockSize, indexT, layout_base>
{
	static constexpr unsigned int dims = dim;

	typedef BlockMapGpu<
	        typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
	        threadBlockSize, indexT, layout_base> BMG;

	const static unsigned char PADDING_BIT = 1;

	linearizer gridGeometry;

	unsigned int stencilSupportRadius;
	unsigned int ghostLayerSize;
621 mutable int index_size_swp_r = -1;
646 inline void swap_internal_remote()
648 n_cnk_cp_swp_r.swap(n_cnk_cp);
649 n_pnt_cp_swp_r.swap(n_pnt_cp);
650 n_shift_cp_swp_r.swap(n_shifts_cp);
651 convert_blk_swp_r.swap(convert_blk);
652 box_cp_swp_r.swap(box_cp);
656 inline void swap_internal_local()
658 offset_ptrs_cp_swp.swap(offset_ptrs_cp);
659 scan_ptrs_cp_swp.swap(scan_ptrs_cp);
660 data_base_ptr_cp_swp.swap(data_base_ptr_cp);
661 n_cnk_cp_swp.swap(n_cnk_cp);
662 n_pnt_cp_swp.swap(n_pnt_cp);
663 n_shift_cp_swp.swap(n_shifts_cp);
664 convert_blk_swp.swap(convert_blk);
665 box_cp_swp.swap(box_cp);
669 inline void swap_local_pack()
671 index_ptrs_swp.swap(index_ptrs);
672 scan_ptrs_swp.swap(scan_ptrs);
673 data_ptrs_swp.swap(data_ptrs);
674 offset_ptrs_swp.swap(offset_ptrs);
675 mask_ptrs_swp.swap(mask_ptrs);
685 inline void swap_remote_pack()
687 index_ptrs_swp_r.swap(index_ptrs);
688 scan_ptrs_swp_r.swap(scan_ptrs);
689 data_ptrs_swp_r.swap(data_ptrs);
690 offset_ptrs_swp_r.swap(offset_ptrs);
691 mask_ptrs_swp_r.swap(mask_ptrs);
703 static constexpr
unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
704 typedef AggregateBlockT AggregateInternalT;
711 static constexpr
unsigned int blockEdgeSize_ = blockEdgeSize;
713 typedef linearizer grid_info;
715 typedef linearizer linearizer_type;
721 typedef indexT indexT_;
723 typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
732 return this->countExistingElements();
740 template <
typename stencil = no_stencil>
756 template<
typename dim3T>
757 inline static int dim3SizeToInt(dim3T d)
759 return d.x * d.y * d.z;
762 inline static int dim3SizeToInt(
size_t d)
767 inline static int dim3SizeToInt(
unsigned int d)
772 template<
typename ... v_reduce>
776 ::template flush<v_reduce ...>(gpuContext, opt);
	void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
	{
		if (is_unpack_remote == true)
		{swap_internal_remote();}

		if (is_unpack_remote == false)
		{swap_internal_local();}
	}

	void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
	{
		if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
		{swap_internal_remote();}

		if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
		{swap_internal_local();}
	}

	void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
	{
		if (is_pack_remote == false)
		{
			swap_local_pack();
			req_index_swp = req_index;
		}

		if (is_pack_remote == true)
		{
			swap_remote_pack();
			req_index_swp_r = req_index;
		}
	}

	void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
	{
		if (opt & KEEP_GEOMETRY && is_pack_remote == false)
		{
			swap_local_pack();
			req_index = req_index_swp;
		}

		if (opt & KEEP_GEOMETRY && is_pack_remote == true)
		{
			swap_remote_pack();
			req_index = req_index_swp_r;
		}
	}
832 template<
unsigned int n_it>
833 void calculatePackingPointsFromBoxes(
int opt,
size_t tot_pnt)
835 if (!(opt & KEEP_GEOMETRY))
845 ite.wthr.x = indexBuffer.size();
848 ite.thr.x = getBlockSize();
853 CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
858 indexBuffer.toKernel(),
861 dataBuffer.toKernel(),
	void computeSizeOfGhostLayer()
	{
		unsigned int term1 = 1;
		for (int i = 0; i < dim; ++i)
		{
			term1 *= blockEdgeSize + 2 * stencilSupportRadius;
		}
		unsigned int term2 = 1;
		for (int i = 0; i < dim; ++i)
		{
			term2 *= blockEdgeSize;
		}
		ghostLayerSize = term1 - term2;
	}
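	// Worked example (illustrative): for dim = 3, blockEdgeSize = 8 and
	// stencilSupportRadius = 1 the enlarged block holds (8 + 2*1)^3 = 1000 cells,
	// the interior block 8^3 = 512, so ghostLayerSize = 1000 - 512 = 488.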
886 void allocateGhostLayerMapping()
891 template<
typename stencil_type>
892 void computeGhostLayerMapping()
894 size_t dimensions[dim],
896 innerDomainBegin[dim], innerDomainEnd[dim],
897 outerBoxBegin[dim], outerBoxEnd[dim],
899 for (
int i = 0; i < dim; ++i)
901 dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
903 innerDomainBegin[i] = stencilSupportRadius - 1;
904 innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
905 outerBoxBegin[i] = origin[i];
906 outerBoxEnd[i] = dimensions[i];
907 bc[i] = NON_PERIODIC;
919 auto coord = gsi.get();
920 assert(i < ghostLayerSize);
921 mem_id linId = enlargedGrid.
LinId(coord);
925 ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
930 assert(i == ghostLayerSize);
	void initialize(const size_t (& res)[dim])
	{
		gridGeometry = linearizer(res);

		computeSizeOfGhostLayer();
		allocateGhostLayerMapping();
		computeGhostLayerMapping<NNStar<dim>>();

		size_t extBlockDims[dim];
		for (int d=0; d<dim; ++d)
		{
			extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
		}
	template <typename stencil, typename... Args>
	void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
	{
		const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
		unsigned int numScalars = indexBuffer_.size() * dataChunkSize;

		if (numScalars == 0) return;

		constexpr unsigned int chunksPerBlock = 1;
		const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
		const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
		                                    ? numScalars / localThreadBlockSize
		                                    : 1 + numScalars / localThreadBlockSize;

#ifdef CUDIFY_USE_CUDA

		CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
				threadGridSize, localThreadBlockSize,
				indexBuffer_.toKernel(),
				dataBuffer_.toKernel(),
				this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),

		auto indexBuffer = indexBuffer_.toKernel();
		auto dataBuffer = dataBuffer_.toKernel();
		auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
		auto lamb = [=] __device__ () mutable
		{
			constexpr unsigned int pIndex = 0;

			typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
			typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;

			typedef typename decltype(dataBuffer)::value_type AggregateT_;
			typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
			typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
			constexpr unsigned int blockSize = MaskBlockT::size;

			const unsigned int dataBlockPos = blockIdx.x;
			const unsigned int offset = threadIdx.x;

			if (dataBlockPos >= indexBuffer.size())
			{return;}

			auto dataBlockLoad = dataBuffer.get(dataBlockPos);

			const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);

			unsigned char curMask;

			if (offset < blockSize)
			{
				curMask = dataBlockLoad.template get<pMask>()[offset];
				for (int i = 0 ; i < dim ; i++)
				{curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
			}

			sdataBlockPos.id = dataBlockPos;

				sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
		};

		CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
	template <typename stencil, typename... Args>
	void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
	{
		const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
		unsigned int numScalars = indexBuffer.size() * dataChunkSize;

		if (numScalars == 0) return;

		auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);

		CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
				indexBuffer.toKernel(),
				dataBuffer.toKernel(),
				this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
	template<typename ids_type>

		for (int i = 0 ; i < chunk_ids.size() ; i++)
		{
			auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);

			for (int j = 0 ; j < dim ; j++)
			{
				box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
				box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
			}

			chunks_box.add(box);
		}
1096 template<
typename MemType,
unsigned int ... prp>
1099 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
1106 for (
size_t i = 0 ; i < copySect.
size() ; i++)
1108 auto sub_it = this->
getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
1116 template<
unsigned int ... prp>
1119 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
1123 template<
unsigned int ... prp>
1129 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
1136 for (
size_t i = 0 ; i < copySect.
size() ; i++)
1138 auto sub_it = this->
getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1140 this->packRequest(sub_it,req);
1151 for (
size_t i = 0 ; i < copySect.
size() ; i++)
1153 auto sub_it = this->
getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1160 size_t req = mem.
size();
1175 template<
unsigned int ... prp>
1176 void removeCopyToFinalize_phase3(
gpu::ofp_context_t& gpuContext,
int opt,
bool is_unpack_remote)
1180 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
1182 if (
tmp2.size() == 0)
1189 auto & add_buff = this->blockMap.private_get_vct_add_index();
1190 add_buff.swap(
tmp2);
1192 auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
1193 ite = nadd_buff.getGPUIterator();
1194 CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
1198 this->
template flush<sLeft_<prp>...>(gpuContext,flush_type::FLUSH_ON_DEVICE);
1204 auto & o_map = this->getSegmentToOutMap();
1205 auto & segments_data = this->getSegmentToMergeIndexMap();
1207 new_map.resize(a_map.size(),0);
1211 ite = segments_data.getGPUIterator();
1213 if (ite.nblocks() != 0)
1214 CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,
new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);
1216 convert_blk.template hostToDevice<0>();
1230 RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
1234 size_t n_accu_cnk = 0;
1235 for (
size_t i = 0 ; i < n_cnk_cp.
size() ; i++)
1238 size_t n_pnt = n_pnt_cp.get(i);
1240 void * data_base_ptr = data_base_ptr_cp.get(i);
1241 data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
1242 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(dpf);
1244 ite.wthr.x = n_cnk_cp.get(i);
1252 for (
int j = 0 ; j < dim ; j++)
1256 if (l >= blockEdgeSize)
1257 {ite.thr.x *= blockEdgeSize;}
1265 if (ite.nblocks() != 0 && ite.thr.x != 0)
1270 AggregateT,decltype(convert_blk.toKernel()),decltype(
new_map.toKernel()),
1271 decltype(data),decltype(chunks.toKernel()),prp... >),ite,
1272 (
unsigned int *)scan_ptrs_cp.get(i),
1273 (
unsigned short int *)offset_ptrs_cp.get(i),
1274 convert_blk.toKernel(),
1278 (
unsigned int)n_cnk_cp.get(i),
1279 (
unsigned int)n_shifts_cp.get(i),
1280 (
unsigned int)n_pnt_cp.get(i),
1282 (
unsigned int)n_accu_cnk);
1285 n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
1289 saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
1292 template<
unsigned int n_it,
unsigned int ... prp>
1296 bool is_pack_remote)
		{std::cerr << __FILE__ << ":" << __LINE__ << " error the packing request number differ from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
1315 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
1319 for (
size_t i = 0 ; i <
pack_subs.size() ; i++)
1321 size_t n_pnt =
tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
1328 for (
size_t i = 0 ; i <
pack_subs.size() ; i++)
1330 size_t n_cnk =
tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
1333 index_ptr.ptr[i] = index_ptrs.get(i);
1334 scan_ptr.ptr[i] = scan_ptrs.get(i);
1338 data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,
tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
1339 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(dpf);
1341 offset_ptr.ptr[i] = offset_ptrs.get(i);
1342 mask_ptr.ptr[i] = mask_ptrs.get(i);
1351 calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
1363 for(
int i = 0 ; i < n_it ; i++)
1365 for (
int j = 0 ; j <
sizeof...(prp) ; j++)
1367 arr_data->ptr[i][j] = data_ptr.ptr[i][j];
1380 decltype(indexBuffer.toKernel()),
1381 decltype(dataBuffer.toKernel()),
1382 decltype(
tmp.toKernel()),
1387 dataBuffer.toKernel(),
1388 indexBuffer.toKernel(),
1407 {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,
tmp.toKernel(),(
unsigned int)indexBuffer.size()+1,(
unsigned int)
pack_subs.size());}
1420 template<
unsigned int ... prp,
typename S2>
1427 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
1442 for (
int i = 0 ; i < dim ; i++)
1446 origPack_cnk.
set_d(i,((
int)(
tmp / blockEdgeSize))*blockEdgeSize);
1450 for (
int i = 0 ; i < dim ; i++)
1457 size_t actual_offset = n_cnk*
sizeof(indexT);
1460 unsigned int * scan = (
unsigned int *)((
unsigned char *)mem.
getDevicePointer() + ps.
getOffset() + n_cnk*
sizeof(indexT));
1463 ps.
getOffset() + actual_offset + n_cnk*
sizeof(
unsigned int) +
sizeof(
unsigned int));
1469 size_t n_pnt = *(
unsigned int *)((
unsigned char *)mem.
getPointer() + ps.
getOffset() + actual_offset + n_cnk*
sizeof(
unsigned int));
1470 actual_offset += align_number(
sizeof(indexT),(n_cnk+1)*
sizeof(
unsigned int));
1474 actual_offset += align_number(
sizeof(indexT),n_pnt*(spq.point_size));
1478 offset_ptrs_cp.add(offsets);
1479 scan_ptrs_cp.add(scan);
1480 n_cnk_cp.add(n_cnk);
1481 n_pnt_cp.add(n_pnt);
1482 data_base_ptr_cp.add(data_base_ptr);
1486 for (
int i = 0 ; i < dim ; i++)
1494 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
short));
1503 for (
int i = 0 ; i < dim ; i++)
1504 {
shifts.last().template get<0>()[i] = 0;}
1506 for (
int i = 0 ; i < dim ; i++)
1508 int op_q = origPack_pnt.
get(i) % blockEdgeSize;
1509 int ou_q = sub_it.
getStart().
get(i) % blockEdgeSize;
1510 int quot = abs(ou_q - op_q) % blockEdgeSize;
1517 for (
int j = 0 ; j < sz ; j++)
1520 for (
int k = 0 ; k < dim ; k++)
1522 shifts.last().template get<0>()[k] =
shifts.template get<0>(j)[k] + ((i == k)?squot:0);
1528 shifts.template hostToDevice<0>();
1530 linearizer gridGeoPack(sz);
1533 size_t sz[1] = {n_cnk};
1535 auto ite = g.getGPUIterator();
1540 for (
int i = 0 ; i < dim ; i++)
1542 sz_g.
set_d(i,gridGeometry.getSize()[i]);
1543 origUnpack_cnk.
set_d(i,(
int)(sub_it.
getStart().
get(i) / blockEdgeSize)*blockEdgeSize);
1549 n_shifts_cp.add(
shifts.size());
1555 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
1557 gridGeoPack,origPack_cnk,
1558 gridGeometry,origUnpack_cnk,
1566 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
1568 gridGeoPack,origPack_cnk,
1569 gridGeometry,origUnpack_cnk,
1577 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
1579 gridGeoPack,origPack_cnk,
1580 gridGeometry,origUnpack_cnk,
1588 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
1590 gridGeoPack,origPack_cnk,
1591 gridGeometry,origUnpack_cnk,
1607 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
unsigned char));
1616 template<
typename origPackType,
typename IteratorType>
1620 for (
int i = 0 ; i < dim ; i++)
1622 int op_q = origPack.get(i) % blockEdgeSize;
1623 int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1624 int quot = abs(ou_q - op_q) % blockEdgeSize;
1632 for (
int j = 0 ; j < this->blockSize ; j++)
1641 for (
int i = 0 ; i < dim ; i++)
1643 int c = x % blockEdgeSize;
1645 if (quot_diff[i] + c < 0)
1647 offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
1650 else if (quot_diff[i] + c >= blockEdgeSize)
1652 offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
1657 offset += pos_c*(quot_diff[i] + c);
1661 pos_c *= blockEdgeSize;
1662 bp_c *= (quot_diff[i] != 0)?2:1;
1666 convert_blk.template get<0>(convert_blk.
size()-1)[pos] = (bpos << 16) + offset;
1672 typedef AggregateT value_type;
1674 typedef self device_grid_type;
1677 :stencilSupportRadius(1)
1695 :stencilSupportRadius(stencilSupportRadius)
1705 : gridGeometry(gridGeometry),
1706 stencilSupportRadius(stencilSupportRadius)
1711 for (
int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
1720 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1724 decltype(extendedBlockGeometry),
1733 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1737 decltype(extendedBlockGeometry),
1743 extendedBlockGeometry,
1744 stencilSupportRadius,
1746 nn_blocks.toKernel(),
1753 template<
unsigned int nNN,
unsigned int nLoop>
1758 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1762 decltype(extendedBlockGeometry),
1771 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1775 decltype(extendedBlockGeometry),
1781 extendedBlockGeometry,
1782 stencilSupportRadius,
1784 nn_blocks.toKernel(),
1817 return gridGeometry;
	template<typename stencil_type>
	void setNNType()
	{
		computeGhostLayerMapping<stencil_type>();
	}

	constexpr static unsigned int getBlockEdgeSize()
	{
		return blockEdgeSize;
	}

	constexpr unsigned int getBlockSize() const
	{
		return blockSize;
	}
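	// Illustrative: getBlockEdgeSize() is a static constexpr query, so it can be
	// used in compile-time contexts, e.g. (sketch)
	//
	//   constexpr unsigned int edge = SparseGridGpu<3, aggregate<float>>::getBlockEdgeSize();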
1843 template<
typename CoordT>
1844 inline size_t getLinId(CoordT &coord)
1846 return gridGeometry.LinId(coord);
1851 return gridGeometry.InvLinId(linId);
1856 return gridSize.getGPUIterator(start,stop,n_thr);
1868 template<
typename CoordT>
1875 auto glid = gridGeometry.LinId(coord);
1877 auto bid = glid / blockSize;
1878 auto lid = glid % blockSize;
1880 auto key = blockMap.get_sparse(bid);
1882 k.set_cnk_pos_id(key.id);
1897 template<
unsigned int p,
typename CoordT>
1912 template<
unsigned int p>
1945 template<
typename CoordT>
1950 gridGeometry.LinId(coord,lin,offset);
1986 template<
unsigned int p>
2004 template<
unsigned int p,
typename CoordT>
2005 auto insert(
const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2010 template<
typename CoordT>
2015 gridGeometry.LinId(coord,ind,offset);
2126 ite.wthr.x = indexBuffer.size();
2130 ite.thr.x = getBlockSize();
2135 output.resize(indexBuffer.size()+1);
2139 CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2141 blockSize>),ite,this->toKernel(),output.toKernel(),db);
2145 openfpm::scan((
unsigned int *)output.template getDeviceBuffer<0>(),output.
size(),(
unsigned int *)output.template getDeviceBuffer<0>(),gpuContext);
2147 output.template deviceToHost<0>(output.
size()-1,output.
size()-1);
2148 unsigned int padding_points = output.template get<0>(output.
size()-1);
2153 pd_points.resize(padding_points);
2166 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
2169 ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),
link_dw_scan.toKernel(),p_dw);
2179 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
2181 blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),
link_dw_scan.toKernel(),
link_dw.toKernel(),p_dw);
2204 ite.wthr.x = indexBuffer.size();
2208 ite.thr.x = getBlockSize();
2213 output.resize(indexBuffer.size()+1);
2217 CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2219 blockSize>),ite,this->toKernel(),output.toKernel(),db);
2223 openfpm::scan((
unsigned int *)output.template getDeviceBuffer<0>(),output.
size(),(
unsigned int *)output.template getDeviceBuffer<0>(),gpuContext);
2225 output.template deviceToHost<0>(output.
size()-1,output.
size()-1);
2226 unsigned int padding_points = output.template get<0>(output.
size()-1);
2231 pd_points.resize(padding_points);
2244 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
2247 ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),
link_up_scan.toKernel(),p_up);
2257 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
2259 blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),
link_up_scan.toKernel(),
link_up.toKernel(),p_up);
	template<typename dim3T>
	void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
	{
		BMG::setGPUInsertBuffer(
			dim3SizeToInt(nBlock),
			dim3SizeToInt(nSlot)
		);
	}

	template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
	void tagBoundaries(gpu::ofp_context_t& gpuContext, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
	{
		const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
		unsigned int numScalars = indexBuffer.size() * dataChunkSize;

		if (numScalars == 0) return;

		if (findNN == false)
		{
			findNeighbours<stencil_type>();
		}
2309 unsigned int localThreadBlockSize = dataChunkSize;
2310 unsigned int threadGridSize = numScalars % dataChunkSize == 0
2311 ? numScalars / dataChunkSize
2312 : 1 + numScalars / dataChunkSize;
2317 if (stencilSupportRadius == 1)
2319 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2325 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2327 else if (stencilSupportRadius == 2)
2329 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2335 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2337 else if (stencilSupportRadius == 0)
2339 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2345 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
			std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;
2354 if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
2359 block_points.resize(indexBuffer.size() + 1);
2360 block_points.template get<0>(block_points.
size()-1) = 0;
2361 block_points.template hostToDevice<0>(block_points.
size()-1,block_points.
size()-1);
2365 ite.wthr.x = indexBuffer.size();
2368 ite.thr.x = getBlockSize();
2374 dataBuffer.toKernel(),
2375 block_points.toKernel());
2378 openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.
size(),(indexT *)block_points.template getDeviceBuffer<0>(),gpuContext);
2381 block_points.template deviceToHost<0>(block_points.
size()-1,block_points.
size()-1);
2382 size_t tot = block_points.template get<0>(block_points.
size()-1);
2387 dataBuffer.toKernel(),
2388 block_points.toKernel(),
2393 cudaDeviceSynchronize();
	template<typename NNtype = NNStar<dim>>
	void findNeighbours()
	{
		const unsigned int numBlocks = indexBuffer.size();
		const unsigned int numScalars = numBlocks * NNtype::nNN;
		nn_blocks.resize(numScalars);

		if (numScalars == 0) return;

		unsigned int localThreadBlockSize = NNtype::nNN;

		unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
		                              ? numScalars / localThreadBlockSize
		                              : 1 + numScalars / localThreadBlockSize;

		CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
				threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());

		findNN = true;
	}
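	// Usage note (sketch): the block neighbourhood list must exist before boundary
	// tagging or stencils run; tagBoundaries() and applyStencils() call
	// findNeighbours<...>() on demand when findNN == false, e.g.
	//
	//   grid.template findNeighbours<NNStar<dim>>();   // explicit call, optional
	//   grid.tagBoundaries(gpuContext);                // would trigger it anyway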
	size_t countExistingElements() const
	{
		typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
		typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
		constexpr unsigned int blockSize = MaskBlockT::size;
		const auto bufferSize = indexBuffer.size();

		size_t numExistingElements = 0;

		for (size_t blockId=0; blockId<bufferSize; ++blockId)
		{
			auto dataBlock = dataBuffer.get(blockId);
			for (size_t elementId=0; elementId<blockSize; ++elementId)
			{
				const auto curMask = dataBlock.template get<pMask>()[elementId];

				if (this->exist(curMask))
				{
					++numExistingElements;
				}
			}
		}

		return numExistingElements;
	}
2453 size_t countBoundaryElements()
2460 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2461 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2462 constexpr
unsigned int blockSize = MaskBlockT::size;
2463 const auto bufferSize = indexBuffer.size();
2465 size_t numBoundaryElements = 0;
2467 for (
size_t blockId=0; blockId<bufferSize; ++blockId)
2469 auto dataBlock = dataBuffer.get(blockId);
2470 for (
size_t elementId=0; elementId<blockSize; ++elementId)
2472 const auto curMask = dataBlock.template get<pMask>()[elementId];
2474 if (this->exist(curMask) && this->isPadding(curMask))
2476 ++numBoundaryElements;
2481 return numBoundaryElements;
2485 void measureBlockOccupancyMemory(
double &mean,
double &deviation)
2492 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2493 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2494 constexpr
unsigned int blockSize = MaskBlockT::size;
2495 const auto bufferSize = indexBuffer.size();
2499 for (
size_t blockId=0; blockId<bufferSize; ++blockId)
2501 auto dataBlock = dataBuffer.get(blockId);
2502 size_t numElementsInBlock = 0;
2503 for (
size_t elementId=0; elementId<blockSize; ++elementId)
2505 const auto curMask = dataBlock.template get<pMask>()[elementId];
2507 if (this->exist(curMask))
2509 ++numElementsInBlock;
2512 double blockOccupancy =
static_cast<double>(numElementsInBlock)/blockSize;
2513 measures.add(blockOccupancy);
2516 standard_deviation(measures, mean, deviation);
2520 void measureBlockOccupancy(
double &mean,
double &deviation)
2527 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2528 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2529 constexpr
unsigned int blockSize = MaskBlockT::size;
2530 const auto bufferSize = indexBuffer.size();
2534 for (
size_t blockId=0; blockId<bufferSize; ++blockId)
2536 auto dataBlock = dataBuffer.get(blockId);
2537 size_t numElementsInBlock = 0;
2538 for (
size_t elementId=0; elementId<blockSize; ++elementId)
2540 const auto curMask = dataBlock.template get<pMask>()[elementId];
2542 if (this->exist(curMask) && !this->isPadding(curMask))
2544 ++numElementsInBlock;
2547 double blockOccupancy =
static_cast<double>(numElementsInBlock)/blockSize;
2548 measures.add(blockOccupancy);
2551 standard_deviation(measures, mean, deviation);
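	// Illustrative use of the two occupancy measures above (sketch):
	//
	//   double mean, dev;
	//   grid.measureBlockOccupancyMemory(mean, dev);  // counts every existing cell, padding included
	//   grid.measureBlockOccupancy(mean, dev);        // counts only non-padding points
	//   // a mean close to 1.0 means the allocated blocks are densely filled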
2568 template<
unsigned int prop_src,
unsigned int prop_dst,
unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2573 for (
int i = 0 ; i < dim ; i++)
2579 applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2587 template<
unsigned int prop_src,
unsigned int prop_dst,
unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2592 for (
int i = 0 ; i < dim ; i++)
2600 applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2607 template<
unsigned int prop_src,
unsigned int prop_dst,
unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2612 for (
int i = 0 ; i < dim ; i++)
2620 applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2627 template<
unsigned int prop_src1,
unsigned int prop_src2,
unsigned int prop_dst1 ,
unsigned int prop_dst2,
unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2632 for (
int i = 0 ; i < dim ; i++)
2640 applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2647 template<
unsigned int prop_src1,
unsigned int prop_src2,
unsigned int prop_src3,
2648 unsigned int prop_dst1 ,
unsigned int prop_dst2,
unsigned int prop_dst3,
2649 unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2654 for (
int i = 0 ; i < dim ; i++)
2662 applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2669 template<
unsigned int prop_src1,
unsigned int prop_src2,
unsigned int prop_dst1 ,
unsigned int prop_dst2,
unsigned int stencil_size,
typename lambda_f,
typename ... ArgsT >
2674 for (
int i = 0 ; i < dim ; i++)
2682 applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2694 for (
int i = 0 ; i < dim ; i++)
2697 b.setHigh(i,gridGeometry.getSize()[i]);
	template<typename stencil, typename... Args>
	void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
	{
		if (findNN == false)
		{
			findNeighbours<typename stencil::stencil_type>();
		}

		switch (mode)
		{
		case STENCIL_MODE_INPLACE:
			applyStencilInPlace<stencil>(box,mode,args...);
			break;
		case STENCIL_MODE_INPLACE_NO_SHARED:
			applyStencilInPlaceNoShared<stencil>(box,mode,args...);
			break;
		}
	}

	template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
	void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
	{
		applyStencils<stencil1>(box,mode, args...);
		applyStencils<stencil2, otherStencils ...>(box,mode, args...);
	}
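	// Illustrative call (sketch): MyStencil stands for a user-provided stencil
	// functor type and is not defined in this file; the whole domain box is
	// processed in-place.
	//
	//   grid.template applyStencils<MyStencil>(grid.getBox(), STENCIL_MODE_INPLACE /*, args */);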
2736 template<
typename BitMaskT>
2737 inline static bool isPadding(BitMaskT &bitMask)
2743 template<
typename BitMaskT>
2744 inline static void setPadding(BitMaskT &bitMask)
2750 template<
typename BitMaskT>
2751 inline static void unsetPadding(BitMaskT &bitMask)
2764 template<
typename CoordT>
2767 return gridGeometry.BlockLinId(blockCoord);
2780 template<
unsigned int p>
2785 indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
2786 indexT local_id = coord.get_data_id();
2791 block_data.template get<BMG::pMask>()[local_id] = 1;
2793 return block_data.template get<p>()[local_id];
2806 template<
typename CoordT>
2809 auto lin = gridGeometry.LinId(coord);
2810 indexT block_id = lin / blockSize;
2811 local_id = lin % blockSize;
2816 block_data.template get<BMG::pMask>()[local_id] = 1;
2831 template<
unsigned int p,
typename CoordT>
2835 auto lin = gridGeometry.LinId(coord);
2836 indexT block_id = lin / blockSize;
2837 indexT local_id = lin % blockSize;
2842 block_data.template get<BMG::pMask>()[local_id] = 1;
2844 return block_data.template get<p>()[local_id];
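	// Host-side insertion sketch (illustrative): insertFlush writes a single point
	// and flushes it immediately, so no GPU insert buffer or flush() is needed.
	//
	//   grid_key_dx<3> key(10,10,10);              // hypothetical coordinate
	//   grid.template insertFlush<0>(key) = 1.0f;  // write property 0 at that point
	//   float v = grid.template get<0>(key);       // read it back on the host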
	template<unsigned int p>
	void print_vct_add_data()
	{
		typedef BlockMapGpu<
		        typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
		        threadBlockSize, indexT, layout_base> BMG;

		auto & bM = BMG::blockMap.private_get_vct_add_data();
		auto & vI = BMG::blockMap.private_get_vct_add_index();
		bM.template deviceToHost<p>();
		vI.template deviceToHost<0>();

		std::cout << "vct_add_data: " << std::endl;

		for (size_t i = 0 ; i < bM.size() ; i++)
		{
			std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
			for (size_t j = 0 ; j < blockSize ; j++)
			{
				std::cout << (int)bM.template get<p>(i)[j] << " ";
			}

			std::cout << std::endl;
		}
	}
	template<unsigned int p>
	void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
	{
		BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
	}
2895 static bool packRequest()
2904 template<
int ... prp>
inline
2911 indexBuffer.template packRequest<prp ...>(req);
2912 dataBuffer.template packRequest<prp ...>(req);
2933 indexBuffer.template pack<prp ...>(mem,sts);
2934 dataBuffer.template pack<prp ...>(mem,sts);
2955 indexBuffer.template
unpack<prp ...>(mem,ps);
2956 dataBuffer.template
unpack<prp ...>(mem,ps);
		if (mem.size() != 0)
		{std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
	template<int ... prp> inline
	void packRequest(size_t & req, gpu::ofp_context_t & gpuContext) const
	{
		ite.wthr.x = indexBuffer.size();
		ite.thr.x = getBlockSize();

		tmp.resize(indexBuffer.size() + 1);

			dataBuffer.toKernel(),

		openfpm::scan((indexT *)tmp.template getDeviceBuffer<0>(),
				tmp.size(), (indexT *)tmp.template getDeviceBuffer<0>(), gpuContext);

		tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);

		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);

		size_t n_pnt = tmp.template get<0>(tmp.size()-1);

		req = sizeof(indexT) +
			sizeof(indexT)*indexBuffer.size() +
			sizeof(indexT)*tmp.size() +
			n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char));
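	// Host-side pack sketch (illustrative; the ExtPreAlloc/HeapMemory wiring follows
	// the usual OpenFPM pattern and is an assumption, not code from this file):
	//
	//   size_t req = 0;
	//   grid.packRequest(req);              // CPU/host-memory variant, no GPU context
	//   HeapMemory pmem;
	//   pmem.allocate(req);
	//   ExtPreAlloc<HeapMemory> mem(req, pmem);
	//   Pack_stat sts;
	//   grid.pack(mem, sts);                // serialize the whole grid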
3040 template<
int ... prp>
inline
3046 for (
int i = 0 ; i < dim ; i++)
3064 offset_ptrs.clear();
3076 template<
int ... prp>
inline
3085 ite.wthr.x = indexBuffer.size();
3088 ite.thr.x = getBlockSize();
3094 if (indexBuffer.size() != 0)
3099 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3104 indexBuffer.toKernel(),
3107 dataBuffer.toKernel(),
3109 (
unsigned int)indexBuffer.size() + 1);
3114 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3119 indexBuffer.toKernel(),
3122 dataBuffer.toKernel(),
3124 (
unsigned int)indexBuffer.size() + 1);
3129 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3134 indexBuffer.toKernel(),
3137 dataBuffer.toKernel(),
3139 (
unsigned int)indexBuffer.size() + 1);
3144 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3149 indexBuffer.toKernel(),
3152 dataBuffer.toKernel(),
3154 (
unsigned int)indexBuffer.size() + 1);
			std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
3164 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
3169 for (
size_t i = 0 ; i <
pack_subs.size() ; i++)
3174 tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3175 tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3178 tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3179 tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3181 openfpm::scan(((indexT *)
tmp.
template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
3182 indexBuffer.size() + 1, (indexT *)
tmp.
template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), gpuContext);
3184 openfpm::scan(((
unsigned int *)
tmp.
template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
3185 indexBuffer.size() + 1, (
unsigned int *)
tmp.
template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), gpuContext);
3187 tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3188 tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3190 scan_it.template get<0>(i) =
tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3192 n_pnt =
tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3193 n_cnk =
tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3195 req +=
sizeof(size_t) +
3197 sizeof(indexT)*n_cnk +
3198 align_number(
sizeof(indexT),(n_cnk+1)*
sizeof(
unsigned int)) +
3199 align_number(
sizeof(indexT),n_pnt*(spq.point_size)) +
3200 align_number(
sizeof(indexT),n_pnt*
sizeof(
short int)) +
3201 align_number(
sizeof(indexT),n_pnt*
sizeof(
unsigned char));
3204 scan_it.template hostToDevice<0>();
3206 openfpm::scan((indexT *)
scan_it.
template getDeviceBuffer<0>(),
3207 scan_it.size(), (indexT *)
scan_it.
template getDeviceBuffer<0>(), gpuContext);
3217 return this->blockMap.getMappingVector();
3227 return this->blockMap.getMergeIndexMapVector();
3248 unsigned int i = req_index;
3251 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
3256 size_t n_pnt =
tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3257 size_t n_cnk =
tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3266 for (
int i = 0 ; i < dim ; i++)
3269 for (
int i = 0 ; i < dim ; i++)
3275 mem.
allocate(n_cnk*
sizeof(indexT));
3279 mem.
allocate( align_number(
sizeof(indexT),(n_cnk+1)*
sizeof(
unsigned int)) );
3283 mem.
allocate( align_number(
sizeof(indexT),n_pnt*(spq.point_size)) );
3287 mem.
allocate( align_number(
sizeof(indexT),n_pnt*
sizeof(
short int) ) );
3291 mem.
allocate( align_number(
sizeof(indexT),n_pnt*
sizeof(
unsigned char) ) );
3314 template<
unsigned int ... prp>
3317 if ((opt & 0x3) == rem_copy_opt::PHASE1)
3319 this->
template removeCopyToFinalize_phase1<prp ...>(gpuContext,opt);
3321 else if ((opt & 0x3) == rem_copy_opt::PHASE2)
3323 this->
template removeCopyToFinalize_phase2<prp ...>(gpuContext,opt);
3327 this->
template removeCopyToFinalize_phase3<prp ...>(gpuContext,opt,
false);
3346 bool is_pack_remote =
false)
3349 RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
3353 pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
3357 pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
3361 pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
			std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
3368 savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
3380 auto & vad = BMG::blockMap.private_get_vct_add_data();
3381 auto & vai = BMG::blockMap.private_get_vct_add_index();
3387 offset_ptrs_cp.clear();
3388 scan_ptrs_cp.clear();
3391 data_base_ptr_cp.clear();
3393 n_shifts_cp.clear();
3394 convert_blk.clear();
3405 gridGeometry.swap(gr.gridGeometry);
3423 if (rem_sects.
size() != 0)
3425 rem_sects.template hostToDevice<0,1>();
3427 tmp.resize(indexBuffer.size() + 1);
3429 tmp.template get<1>(
tmp.size()-1) = 0;
3430 tmp.template hostToDevice<1>(
tmp.size()-1,
tmp.size()-1);
3432 auto ite = indexBuffer.getGPUIterator();
3434 if (has_work_gpu(ite) ==
true)
3437 CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
3439 blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
3440 gridGeometry,dataBuffer.toKernel(),
3444 openfpm::scan((
unsigned int *)
tmp.template getDeviceBuffer<1>(),
tmp.size(),(
unsigned int *)
tmp.template getDeviceBuffer<1>(),gpuContext);
3446 tmp.template deviceToHost<1>(
tmp.size()-1,
tmp.size()-1);
3449 size_t nr_cnk =
tmp.template get<1>(
tmp.size()-1);
3451 tmp3.resize(nr_cnk);
3454 ite = indexBuffer.getGPUIterator();
3456 if (has_work_gpu(ite) ==
false) {
return;}
3458 CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,
tmp.toKernel(),
tmp3.toKernel());
3462 ite =
tmp3.getGPUIterator();
3464 ite.wthr.x =
tmp3.size();
3467 ite.thr.x = getBlockSize();
3471 if (has_work_gpu(ite) ==
false) {
return;}
3473 CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
3475 ite,indexBuffer.toKernel(),
3477 dataBuffer.toKernel(),
3479 rem_sects.toKernel());
3491 template<
unsigned int ... prp>
3494 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
3497 removeCopyToFinalize_phase3<prp ...>(gpuContext,opt,
true);
3508 offset_ptrs_cp.clear();
3509 scan_ptrs_cp.clear();
3510 data_base_ptr_cp.clear();
3513 n_shifts_cp.clear();
3514 convert_blk.clear();
3516 data_base_ptr_cp.clear();
3530 rem_sects.add(section_to_delete);
3558 grid_src.copySect.add(sgs);
3564 template<
typename pointers_type,
3565 typename headers_type,
3566 typename result_type,
3567 unsigned int ... prp >
3568 static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result,
int n_slot)
3572 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
3574 result.allocate(
sizeof(
int));
3576 if (pointers.size())
3577 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
3578 pointers.toKernel(),
3580 (
int *)result.getDevicePointer(),
3581 (
unsigned int)spq.point_size,
3594 template<
unsigned int ... prp,
typename S2,
typename header_type>
3597 header_type & headers,
3601 rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3605 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
3615 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
3619 size_t n_cnk = headers.template get<1>(ih);
3621 ps.
addOffset(2*dim*
sizeof(
unsigned int));
3623 size_t actual_offset = n_cnk*
sizeof(indexT);
3624 unsigned int * scan = (
unsigned int *)((
unsigned char *)mem.
getDevicePointer() + ps.
getOffset() + n_cnk*
sizeof(indexT));
3628 size_t n_pnt = headers.template get<2>(ih);
3629 actual_offset += align_number(
sizeof(indexT),(n_cnk+1)*
sizeof(
unsigned int));
3633 actual_offset += align_number(
sizeof(indexT),n_pnt*(spq.point_size));
3636 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
short));
3637 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
unsigned char));
3639 scan_ptrs_cp.add(scan);
3640 offset_ptrs_cp.add(offsets);
3641 data_base_ptr_cp.add(data_base_ptr);
3664 template<
unsigned int ... prp,
typename S2>
3669 rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3673 if ((opt & rem_copy_opt::KEEP_GEOMETRY) ==
false)
3683 boost::mpl::for_each_ref<boost::mpl::range_c<
int,0,
sizeof...(prp)>>(spq);
3706 ps.
addOffset(2*dim*
sizeof(
unsigned int));
3708 size_t actual_offset = n_cnk*
sizeof(indexT);
3709 unsigned int * scan = (
unsigned int *)((
unsigned char *)mem.
getDevicePointer() + ps.
getOffset() + n_cnk*
sizeof(indexT));
3712 ps.
getOffset() + actual_offset + n_cnk*
sizeof(
unsigned int) +
sizeof(
unsigned int));
3716 size_t n_pnt = *(
unsigned int *)((
unsigned char *)mem.
getPointer() + ps.
getOffset() + actual_offset + n_cnk*
sizeof(
unsigned int));
3717 actual_offset += align_number(
sizeof(indexT),(n_cnk+1)*
sizeof(
unsigned int));
3721 actual_offset += align_number(
sizeof(indexT),n_pnt*(spq.point_size));
3724 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
short));
3725 actual_offset += align_number(
sizeof(indexT),n_pnt*
sizeof(
unsigned char));
3727 scan_ptrs_cp.add(scan);
3728 offset_ptrs_cp.add(offsets);
3729 data_base_ptr_cp.add(data_base_ptr);
3801 auto getSegmentToOutMap() const -> decltype(
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3811 auto getSegmentToMergeIndexMap() const -> decltype(
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)

	template<typename Tw = float>
	bool write(const std::string & output)
	{
		return write_with_spacing_offset(output,spacing,offset);
	}
	template<typename Tw = float>

		file_type ft = file_type::BINARY;

		auto & index = bm.getIndexBuffer();
		auto & data = bm.getDataBuffer();

		auto it = index.getIterator();

			auto key = it.get();

			for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
			{
				for (size_t k = 0 ; k < dim ; k++)
				{p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}

				cp(data.get_o(key),tmp_prp.last(),key,i);

				boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);

				tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
			}

		vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());

		return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
		fill_chunks_boxes(chunks_box,ids,spacing,offset);

		vtk_box1.add(chunks_box);
		vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));

		write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
template<unsigned int dim,
         typename AggregateT,
         typename indexT=long int,

template<unsigned int dim,
         typename AggregateT,
         typename indexT=int,

template<unsigned int dim,
         typename AggregateT,
         typename indexT=int,
int sgn(T val)
Gets the sign of a variable.
void removeUnusedBuffers()
Eliminate many internal temporary buffer you can use this between flushes if you get some out of memo...
auto insert_o(unsigned int linId) -> decltype(blockMap.insert(0))
insert data, host version
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId, is_new).template get< p >())
insert a block + flush, host version
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
void setGPUInsertBuffer(int nBlock, int nSlot)
decltype(blockMap) & private_get_blockMap()
Return internal structure block map.
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
__device__ __host__ T getHigh(int i) const
get the high interval of the box
__device__ __host__ void setHigh(int i, T val)
set the high interval of the box
__device__ __host__ void setLow(int i, T val)
set the low interval of the box
virtual void * getDevicePointer()
get a readable pointer with the data
virtual bool resize(size_t sz)
resize the momory allocated
virtual void hostToDevice()
Move memory from host to device.
virtual size_t size() const
the the size of the allocated memory
virtual void * getPointer()
get a readable pointer with the data
virtual bool allocate(size_t sz)
allocate memory
virtual void decRef()
Decrement the reference counter.
size_t getOffsetEnd()
Get offset.
size_t getOffset()
Get offset.
virtual void * getDevicePointer()
Return the pointer of the last allocation.
virtual void incRef()
Increment the reference counter.
virtual void deviceToHost()
Do nothing.
virtual void * getPointer()
Return the pointer of the last allocation.
void reset()
Reset the internal counters.
virtual void hostToDevice()
Return the pointer of the last allocation.
virtual bool allocate(size_t sz)
Allocate a chunk of memory.
virtual size_t size() const
Get the size of the LAST allocated memory.
This class allocate, and destroy CPU memory.
static void pack(ExtPreAlloc< Mem >, const T &obj)
Error, no implementation.
This class implement the point shape in an N-dimensional space.
void one()
Set to one the point coordinate.
__device__ __host__ void zero()
Set to zero the point coordinate.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
grid_key_dx< dim > getStop() const
Return the stop point.
grid_key_dx< dim > getStart() const
Return the starting point.
openfpm::vector_gpu< aggregate< int, short int > > link_dw
links of the padding points with real points of a finer sparsegrid
openfpm::vector_gpu< aggregate< size_t > > links_up
links of the padding points with real points of a coarse sparsegrid
void unpack_with_headers(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, header_type &headers, int ih, Unpack_stat &ps, gpu::ofp_context_t &gpuContext, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
void addAndConvertPackedChunkToTmp(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &gpuContext)
unpack the sub-grid object
openfpm::vector_gpu< aggregate< int, short int > > & getUpLinks()
Get the links up for each point.
SparseGridGpu(linearizer &gridGeometry, unsigned int stencilSupportRadius=1)
Constructor from glock geometry.
size_t getBlockLinId(const CoordT &blockCoord) const
Linearization of block coordinates.
openfpm::vector_gpu< aggregate< indexT > > tmp3
temporal 3
auto private_get_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer()) &
Return the index array of the blocks.
void conv_cross(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a convolution using a cross like stencil.
openfpm::vector_gpu< aggregate< int[dim]> > shifts
shifts for chunk conversion
auto get(const grid_key_dx< dim, CoordT > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using the point coordinates.
void pack(ExtPreAlloc< CudaMemory > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Pack_stat &sts)
Pack the object into the memory given an iterator.
auto get_o(const grid_key_dx< dim, CoordT > &coord) const -> encap_data_block< typename std::remove_const< decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::get(0))>::type >
Get an element using the point coordinates.
auto get(const sparse_grid_gpu_index< self > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
SparseGridGpu(const size_t(&res)[dim], unsigned int stencilSupportRadius=1)
Constructor from glock geometry.
auto private_get_add_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the blocks.
ExtPreAlloc< CudaMemory > * prAlloc_prp
Memory to remove copy finalize.
void remove(const Box< dim, int > §ion_to_delete)
Remove all the points in this region.
openfpm::vector_gpu< aggregate< indexT > > e_points
void unpack(ExtPreAlloc< CudaMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
auto private_get_neighborhood_array() -> decltype(nn_blocks) &
Return the index array of the blocks.
void setNNType()
Set the neighborhood type.
openfpm::vector_gpu< aggregate< unsigned int > > pack_output
Helper array to pack points.
auto get(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
openfpm::vector_gpu< aggregate< indexT > > tmp2
temporal 2
auto insertFlush(const grid_key_dx< dim, CoordT > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void packRequest(size_t &req) const
Asking to pack a SparseGrid GPU without GPU context pack the grid on CPU and host memory.
void setBackgroundValue(typename boost::mpl::at< typename AggregateT::type, boost::mpl::int_< p >>::type backgroundValue)
set the background for property p
auto private_get_data_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer())
Return the data array of the blocks.
auto insertFlush(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
void preFlush()
If the added index buffer and the add data buffer are set manually, this function must be called ...
auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
Return the mapping vector used to know where the data has been added.
void packRequest(size_t &req, gpu::ofp_context_t &gpuContext) const
memory requested to pack this object
static bool is_unpack_header_supported()
Indicate that unpacking the header is supported.
Box< dim, int > getBox()
Return a Box with the range of the SparseGrid.
openfpm::vector_gpu< aggregate< short int, short int > > ghostLayerToThreadsMapping
void copyRemoveReset()
Reset the queue to remove and copy section of grids.
unsigned char getFlag(const sparse_grid_gpu_index< self > &coord) const
Return the flag of the point.
void removeAddUnpackReset()
In this case it does nothing.
static SparseGridGpu_iterator_sub< dim, self > type_of_subiterator()
This is a meta-function returning which type of sub-iterator a grid produces.
void packCalculate(size_t &req, gpu::ofp_context_t &gpuContext)
Calculate the size of the information to pack.
size_t size() const
Return the size of the grid.
void packRequest(SparseGridGpu_iterator_sub< dim, self > &sub_it, size_t &req) const
Calculate the size to pack part of this structure.
void pack(ExtPreAlloc< HeapMemory > &mem, Pack_stat &sts) const
Pack the object into the memory.
linearizer & getGrid()
Return the grid information object.
static constexpr bool isCompressed()
This is a multiresolution sparse grid, so it is a compressed format.
void construct_link_up(self &grid_up, const Box< dim, int > &db_, Point< dim, int > p_up, gpu::ofp_context_t &gpuContext)
Construct the links on the upper levels.
auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
Return the merge index map vector used to know where the data has been added.
decltype(self::type_of_iterator()) getIterator() const
Return a SparseGrid iterator.
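A host-side traversal sketch; the isNext()/get()/operator++ idiom of the OpenFPM iterators is assumed here, as is the grid from the construction sketch above, and the accumulation over property 0 is illustrative.
// Hedged sketch: visit every existing point and accumulate property 0.
float sum = 0.0f;
auto it = grid.getIterator();
while (it.isNext())
{
    auto p = it.get();                  // sparse_grid_gpu_index: chunk id + local point id
    sum += grid.template get<0>(p);     // safe: the index refers to an existing point
    ++it;
}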
void copy_to(self &grid_src, const Box< dim, size_t > &box_src, const Box< dim, size_t > &box_dst)
It queues a copy.
void removeAddUnpackFinalize(gpu::ofp_context_t &gpuContext, int opt)
This function removes the points queued for removal and flushes all the added/unpacked data.
openfpm::vector_gpu< aggregate< int, short int > > & getDownLinks()
Get the links down for each point.
void resize(size_t(&res)[dim])
Resize the SparseGrid.
openfpm::vector_gpu< aggregate< int, short int > > link_up
links of the padding points with real points of a finer sparsegrid
void construct_link(self &grid_up, self &grid_dw, gpu::ofp_context_t &gpuContext)
Construct the links between levels.
void conv2_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
void conv_cross_b(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
void unpack(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &gpuContext, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
Unpack the sub-grid object.
openfpm::vector_gpu< aggregate< indexT, unsigned int > > tmp
Temporary buffer.
void packReset()
Reset the pack calculation.
void removeCopyToFinalize(gpu::ofp_context_t &gpuContext, int opt)
Finalize the queued remove() and copy_to() operations.
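remove() and copy_to() only queue work; the following is a hedged ordering sketch, where other_grid, box_src, box_dst, gpuContext and the opt value 0 are assumptions for illustration.
// Hedged sketch: queue a removal and a copy, then execute both on the GPU.
Box<2, int> to_delete({0, 0}, {15, 15});
grid.remove(to_delete);                       // queued, not executed yet
grid.copy_to(other_grid, box_src, box_dst);   // queued copy from other_grid into this grid
grid.removeCopyToFinalize(gpuContext, 0);     // run the queued remove/copy operations
grid.copyRemoveReset();                       // reset the remove/copy queues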
bool isSkipLabellingPossible()
This function checks whether keeping the geometry (skip labelling) is possible for this grid.
openfpm::vector_gpu< Box< dim, int > > pack_subs
The set of all sub-sets to pack.
auto private_get_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer())
Return the index array of the blocks.
auto insertBlockFlush(const grid_key_dx< dim, CoordT > &coord, indexT &local_id) -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::insertBlockFlush(0))
Insert the block on the host side and flush directly.
void construct_link_dw(self &grid_dw, const Box< dim, int > &db_, Point< dim, int > p_dw, gpu::ofp_context_t &gpuContext)
Construct the links on the lower level.
auto private_get_data_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer()) &
Return the data array of the blocks.
auto get_o(const sparse_grid_gpu_index< self > &coord) const -> encap_data_block< typename std::remove_const< decltype(private_get_data_array().get(0))>::type >
Get an element using a sparse_grid_gpu_index (this index guarantees that the point exists).
static SparseGridGpu_iterator< dim, self > type_of_iterator()
This is a meta-function returning which type of iterator a grid produces.
openfpm::vector_gpu< aggregate< unsigned int > > & getDownLinksOffsets()
Get the offsets for each point of the links down.
static void unpack_headers(pointers_type &pointers, headers_type &headers, result_type &result, int n_slot)
Stub does not do anything.
void conv3_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< int > > new_map
Map between the (last) added chunks and their position in the chunk data.
int yes_i_am_grid
It defines that this data-structure is a grid.
void convertChunkIds(short int *offset, origPackType &origPack, IteratorType &sub_it)
Convert the offset index from the packed buffer to the add buffer.
void unpack(ExtPreAlloc< HeapMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
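A hedged host-memory round trip combining the HeapMemory pack/unpack entry points listed here; grid2 and the exact ordering are assumptions, not a verified recipe.
// Hedged sketch: pack the whole grid into host memory and unpack it into grid2.
size_t req = 0;
grid.packRequest(req);                    // CPU/host-memory size estimate, no GPU context
HeapMemory hmem;
hmem.allocate(req);
ExtPreAlloc<HeapMemory> pmem(req, hmem);
Pack_stat sts;
grid.pack(pmem, sts);                     // pack the object into the memory
Unpack_stat ps;
grid2.unpack(pmem, ps);                   // unpack the object from the memory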
void removePoints(gpu::ofp_context_t &gpuContext)
Remove the points queued for removal.
void conv2(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< unsigned int > > & getUpLinksOffsets()
Get the offsets for each point of the links up.
openfpm::vector_gpu< aggregate< unsigned int > > link_dw_scan
scan offsets of the links down
base_key get_sparse(const grid_key_dx< dim, CoordT > &coord) const
Get the sparse index of an element using the point coordinates.
void removeUnusedBuffers()
Eliminate many internal temporary buffers; you can use this between flushes if you run out of memory.
openfpm::vector_gpu< aggregate< unsigned int > > link_up_scan
scan offsets of the links up
auto private_get_add_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the blocks.
openfpm::vector_gpu< aggregate< indexT > > scan_it
Contains the scan of the points for each iterator.
void packFinalize(ExtPreAlloc< CudaMemory > &mem, Pack_stat &sts, int opt=0, bool is_pack_remote=false)
Finalize the packing procedure.
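The GPU-side pack entry points above are meant to be chained; this is a hedged sketch of one plausible sequence for packing the region selected by a sub-grid iterator, where sub_it, gpuContext and the ordering itself are assumptions.
// Hedged sketch: size the pack, allocate device-backed memory, then pack the region.
size_t req = 0;
grid.packReset();                            // reset any previous pack calculation
grid.packRequest(sub_it, req);               // size needed for the region covered by sub_it
grid.packCalculate(req, gpuContext);         // finish the size calculation on the GPU
CudaMemory cmem;
cmem.allocate(req);
ExtPreAlloc<CudaMemory> mem(req, cmem);
Pack_stat sts;
grid.pack(mem, sub_it, sts);                 // write the selected points into mem
grid.packFinalize(mem, sts);                 // finalize the packing procedure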
void conv(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
size_t getOffset()
Return the current offset counter.
void addOffset(size_t off)
Increment the offset pointer by off.
static void unpack(ExtPreAlloc< Mem >, T &obj)
Error, no implementation.
void operator()(T &t) const
It calls the copy function for each property.
void operator()(T &t) const
It calls the copy function for each property.
void * base_ptr
data pointers
grid_key_dx is the key to access any element in the grid.
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
__device__ __host__ index_type get(index_type i) const
Get the i index.
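A short sketch of composing a key with set_d() and reading a component back with get(); the dimension and values are illustrative.
// Hedged sketch: build a 3D key component by component.
grid_key_dx<3> k;
k.set_d(0, 4);        // x index
k.set_d(1, 7);        // y index
k.set_d(2, 1);        // z index
auto kx = k.get(0);   // read a single component back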
void setDimensions(const size_t(&dims)[N])
Reset the dimension of the grid.
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const signed char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
Implementation of a 1-D std::vector-like structure.
The element index contains a data chunk index and a point index.
int get_cnk_pos_id() const
Get chunk position id.
int get_data_id() const
Get the local index inside the chunk (the returned index is < getBlockSize()).
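These two accessors split a sparse index into its chunk part and its intra-chunk part; a small sketch, assuming p comes from the grid iterator shown earlier.
// Hedged sketch: decompose an iterator-produced index.
auto p = it.get();               // sparse_grid_gpu_index
int chunk = p.get_cnk_pos_id();  // position of the data chunk
int local = p.get_data_id();     // point index inside the chunk (< block size)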
Get the type of the insertBlock.
Get the type of the block.
static __device__ bool getNNindex_offset(grid_key_dx< dim, indexT2 > &coord, unsigned int &NN_index, unsigned int &offset_nn)
Given a coordinate written in local coordinates for a given chunk, it returns the neighborhood chunk position ...
Aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native structure ...
This class is a functor for the "for_each" algorithm.
Transform the boost::fusion::vector into memory specification (memory_traits)
This class is a functor for the "for_each" algorithm.