#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
#define OPENFPM_PDATA_SPARSEGRIDGPU_HPP

constexpr int BLOCK_SIZE_STENCIL = 128;

#include "util/cuda_launch.hpp"
#include <SparseGridGpu/BlockMapGpu.hpp>
#include <Grid/iterators/grid_skin_iterator.hpp>
#include <Grid/Geometry/grid_smb.hpp>
#include "SparseGridGpu_ker.cuh"
#include "SparseGridGpu_kernels.cuh"
#include "Iterators/SparseGridGpu_iterator_sub.hpp"
#include "Grid/Geometry/grid_zmb.hpp"
#include "util/stat/common_statistics.hpp"
#include "Iterators/SparseGridGpu_iterator.hpp"
#include "Space/SpaceBox.hpp"

#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
#include "VTKWriter/VTKWriter.hpp"
#endif

constexpr int NO_ITERATOR_INIT = 0;

enum tag_boundaries
{
    NO_CALCULATE_EXISTING_POINTS,
    CALCULATE_EXISTING_POINTS
};
template<unsigned int dim>
    typedef boost::mpl::int_<2> type;
// ...
    typedef boost::mpl::int_<256> type;
    typedef boost::mpl::int_<256> tb;
// ...
    typedef boost::mpl::int_<16> type;
    typedef boost::mpl::int_<256> tb;
// ...
    typedef boost::mpl::int_<8> type;
    typedef boost::mpl::int_<512> tb;
template<typename T, unsigned int dim, unsigned int blockEdgeSize>
// ...
template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>
// ...
template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>
// ...
template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>
// ...
template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>
// ...
template<typename aggr>
// ...
template<typename ... types>
// ...
template<typename enc_type>
    :offset(offset),enc(enc)
// ...
    boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);
// ...
    template<unsigned int p>
    auto get() -> decltype(enc.template get<p>()[offset])
    {
        return enc.template get<p>()[offset];
    }

    template<unsigned int p>
    auto get() const -> decltype(enc.template get<p>()[offset])
    {
        return enc.template get<p>()[offset];
    }
enum StencilMode
{
    STENCIL_MODE_INPLACE = 1,
    STENCIL_MODE_INPLACE_NO_SHARED = 3
};

template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
// ...
    SGridGpu::device_grid_type::dims> type;

#include "encap_num.hpp"

template<typename SGridGpu>
template<unsigned int dim>
    __device__ inline static bool is_padding()
    {
        printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
    }
// ...
    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
    {
        bool isPadding_ = false;
        for (int i = 0 ; i < 3 ; i++)
        {
            for (int j = 0 ; j < 3 ; j++)
            {
                for (int k = 0 ; k < 3 ; k++)
                {
                    // ...
                    auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
                    typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
                    isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
                    if (isPadding_) break;
                }
            }
        }
        // ...
    }
    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
    {
        bool isPadding_ = false;
        for (int i = 0 ; i < 3 ; i++)
        {
            for (int j = 0 ; j < 3 ; j++)
            {
                // ...
                auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
                typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
                isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
                if (isPadding_) break;
            }
        }
        // ...
    }
template<unsigned int dim>
// ...
    template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
    __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
                                             blockMap_type & blockMap,
                                             SparseGrid_type & sparseGrid,
                                             const unsigned int offset)
    {
        // ...
        int neighbourPos = blockMap.size();
        if (offset < nNN && offset != nNN / 2)
        {
            // ...
            for (int i = 0 ; i < dim ; i++)
            {
                // ...
                blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
            }
            neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
        }
        // ...
    }

    template<typename indexT, unsigned int blockEdgeSize, typename coordType>
    __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
    {
        // ...
        indexT neighbourNum = 0;
        // ...
        for (int i = 0 ; i < dim ; i++)
        {
            int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
            // ...
            else if (c >= blockEdgeSize)
            {
                neighbourNum += 2*accu;
            }
            // ...
            neighbourNum += accu;
        }
        // ...
    }
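getNNskin() above encodes, for every dimension, whether the ghost coordinate falls below, inside, or above the block, producing one base-3 digit per dimension. A minimal standalone sketch of that encoding (illustrative only; it assumes the accumulator accu, not shown in the excerpt, grows by a factor of 3 per dimension):

#include <cstdio>

// Skin index for a 3D coordinate relative to a block: digit 0 = below,
// 1 = inside, 2 = above, packed as a base-3 number (one digit per axis).
int skin_index(const int (&coord)[3], int blockEdgeSize, int stencilSupportRadius)
{
    int neighbourNum = 0;
    int accu = 1;                                  // assumed 3^i weight per axis
    for (int i = 0; i < 3; i++)
    {
        int c = coord[i] - stencilSupportRadius;
        if (c < 0)                    { /* digit 0 contributes nothing */ }
        else if (c >= blockEdgeSize)  { neighbourNum += 2 * accu; }
        else                          { neighbourNum += accu; }
        accu *= 3;
    }
    return neighbourNum;
}

int main()
{
    int c[3] = {0, 5, 9};
    // blockEdgeSize = 8, stencilSupportRadius = 1: digits (0,1,2) -> 0 + 3 + 18 = 21
    std::printf("%d\n", skin_index(c, 8, 1));
}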
    template<typename sparseGrid_type, typename coord_type, typename Mask_type, unsigned int eb_size>
    __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
// ...
    template<unsigned int blockEdgeSize, typename indexT2>
// ...
        for (unsigned int i = 0 ; i < dim ; i++)
        {
            int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
            // ...
            offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
            // ...
            cnt_off *= blockEdgeSize;
        }
template<unsigned int nNN_, unsigned int nLoop_>
    static const unsigned int nNN = nNN_;
    static const unsigned int nLoop = nLoop_;

template<typename copy_type>
    template<typename T, typename dst_type, typename src_type>
    static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
    {
        dst.template get<T::value>() = src.template get<T::value>()[bPos];
    }

template<typename copy_type, unsigned int N1>
    template<typename T, typename dst_type, typename src_type>
    static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
    {
        for (int i = 0 ; i < N1 ; i++)
        {
            dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
        }
    }

template<typename Tsrc, typename Tdst>
// ...
    typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
    // ...
    mutable size_t tot = 0;
    // ...
    typedef typename boost::mpl::at<vprp,T>::type prp_cp;
    // ...
    typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

    arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
    // ...
    tot += sz * sizeof(pack_type);

template<typename SparseGridType>
    SparseGridType * grd;
    // ...
    :grd(&grd),src(src),dst(dst)
template<unsigned int dim,
         // ...
         typename indexT = long int,
         // ...
         typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
         threadBlockSize, indexT, layout_base>
// ...
    static constexpr unsigned int dims = dim;
    // ...
                    threadBlockSize, indexT, layout_base> BMG;

    const static unsigned char PADDING_BIT = 1;
    // ...
    linearizer gridGeometry;
    // ...
    unsigned int stencilSupportRadius;
    unsigned int ghostLayerSize;
    // ...
    mutable int index_size_swp_r = -1;
    inline void swap_internal_remote()
    {
        n_cnk_cp_swp_r.swap(n_cnk_cp);
        n_pnt_cp_swp_r.swap(n_pnt_cp);
        n_shift_cp_swp_r.swap(n_shifts_cp);
        convert_blk_swp_r.swap(convert_blk);
        box_cp_swp_r.swap(box_cp);
    }

    inline void swap_internal_local()
    {
        offset_ptrs_cp_swp.swap(offset_ptrs_cp);
        scan_ptrs_cp_swp.swap(scan_ptrs_cp);
        data_base_ptr_cp_swp.swap(data_base_ptr_cp);
        n_cnk_cp_swp.swap(n_cnk_cp);
        n_pnt_cp_swp.swap(n_pnt_cp);
        n_shift_cp_swp.swap(n_shifts_cp);
        convert_blk_swp.swap(convert_blk);
        box_cp_swp.swap(box_cp);
    }

    inline void swap_local_pack()
    {
        index_ptrs_swp.swap(index_ptrs);
        scan_ptrs_swp.swap(scan_ptrs);
        data_ptrs_swp.swap(data_ptrs);
        offset_ptrs_swp.swap(offset_ptrs);
        mask_ptrs_swp.swap(mask_ptrs);
    }

    inline void swap_remote_pack()
    {
        index_ptrs_swp_r.swap(index_ptrs);
        scan_ptrs_swp_r.swap(scan_ptrs);
        data_ptrs_swp_r.swap(data_ptrs);
        offset_ptrs_swp_r.swap(offset_ptrs);
        mask_ptrs_swp_r.swap(mask_ptrs);
    }
    static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
    typedef AggregateBlockT AggregateInternalT;
    // ...
    static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;

    typedef linearizer grid_info;
    typedef linearizer linearizer_type;
    // ...
    typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
    // ...
        return this->countExistingElements();
    // ...
    template <typename stencil = no_stencil>
    // ...
    template<typename dim3T>
    inline static int dim3SizeToInt(dim3T d)
    {
        return d.x * d.y * d.z;
    }

    inline static int dim3SizeToInt(size_t d)
    // ...
    inline static int dim3SizeToInt(unsigned int d)
    // ...
    template<typename ... v_reduce>
    void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
    {
        // ...
            ::template flush<v_reduce ...>(context, opt);
    }
    void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
    {
        if (is_unpack_remote == true)
        {swap_internal_remote();}

        if (is_unpack_remote == false)
        {swap_internal_local();}
    }

    void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
    {
        if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
        {swap_internal_remote();}

        if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
        {swap_internal_local();}
    }

    void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
    {
        if (is_pack_remote == false)
        {
            // ...
            req_index_swp = req_index;
        }

        if (is_pack_remote == true)
        {
            // ...
            req_index_swp_r = req_index;
        }
    }

    void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
    {
        if (opt & KEEP_GEOMETRY && is_pack_remote == false)
        {
            // ...
            req_index = req_index_swp;
        }

        if (opt & KEEP_GEOMETRY && is_pack_remote == true)
        {
            // ...
            req_index = req_index_swp_r;
        }
    }
    template<unsigned int n_it>
    void calculatePackingPointsFromBoxes(int opt, size_t tot_pnt)
    {
        if (!(opt & KEEP_GEOMETRY))
        {
            // ...
            ite.wthr.x = indexBuffer.size();
            // ...
            ite.thr.x = getBlockSize();
            // ...
            CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
                         // ...
                         indexBuffer.toKernel(),
                         // ...
                         dataBuffer.toKernel(),
                         // ...
        }
    }
    void computeSizeOfGhostLayer()
    {
        unsigned int term1 = 1;
        for (int i = 0; i < dim; ++i)
        {
            term1 *= blockEdgeSize + 2 * stencilSupportRadius;
        }
        unsigned int term2 = 1;
        for (int i = 0; i < dim; ++i)
        {
            term2 *= blockEdgeSize;
        }
        ghostLayerSize = term1 - term2;
    }
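The two products above are simply (blockEdgeSize + 2*stencilSupportRadius)^dim and blockEdgeSize^dim, so the ghost layer is their difference. A quick check of the numbers (illustrative values only):

// dim = 3, blockEdgeSize = 8, stencilSupportRadius = 1:
// enlarged block 10^3 = 1000 slots, interior 8^3 = 512 slots
constexpr unsigned int ghostLayerSize3d = 10*10*10 - 8*8*8;   // == 488
static_assert(ghostLayerSize3d == 488, "ghost layer of an 8^3 block with radius 1");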
    void allocateGhostLayerMapping()
    // ...

    template<typename stencil_type>
    void computeGhostLayerMapping()
    {
        size_t dimensions[dim],
               innerDomainBegin[dim], innerDomainEnd[dim],
               outerBoxBegin[dim], outerBoxEnd[dim],
               // ...
        for (int i = 0; i < dim; ++i)
        {
            dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
            // ...
            innerDomainBegin[i] = stencilSupportRadius - 1;
            innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
            outerBoxBegin[i] = origin[i];
            outerBoxEnd[i] = dimensions[i];
            bc[i] = NON_PERIODIC;
        }
        // ...
        auto coord = gsi.get();
        assert(i < ghostLayerSize);
        mem_id linId = enlargedGrid.LinId(coord);
        // ...
        ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
        // ...
        assert(i == ghostLayerSize);
    }
    void initialize(const size_t (& res)[dim])
    {
        gridGeometry = linearizer(res);
        // ...
        computeSizeOfGhostLayer();
        allocateGhostLayerMapping();
        computeGhostLayerMapping<NNStar<dim>>();

        size_t extBlockDims[dim];
        for (int d = 0; d < dim; ++d)
        {
            extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
        }
        // ...
    }
    template <typename stencil, typename... Args>
    void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode, Args... args)
    {
        // ...
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer_.size() * dataChunkSize;

        if (numScalars == 0) return;
        // ...
        constexpr unsigned int chunksPerBlock = 1;
        const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
        const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
                                          ? numScalars / localThreadBlockSize
                                          : 1 + numScalars / localThreadBlockSize;

#ifdef CUDIFY_USE_CUDA

        CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
                          // ...
                          threadGridSize, localThreadBlockSize,
                          indexBuffer_.toKernel(),
                          dataBuffer_.toKernel(),
                          this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
                          // ...

#else

        auto indexBuffer = indexBuffer_.toKernel();
        auto dataBuffer = dataBuffer_.toKernel();
        auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
        // ...
        auto lamb = [=] __device__ () mutable
        {
            constexpr unsigned int pIndex = 0;

            typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
            typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;

            typedef typename decltype(dataBuffer)::value_type AggregateT_;
            typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
            typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
            constexpr unsigned int blockSize = MaskBlockT::size;
            // ...
            const unsigned int dataBlockPos = blockIdx.x;
            const unsigned int offset = threadIdx.x;

            if (dataBlockPos >= indexBuffer.size())
            // ...
            auto dataBlockLoad = dataBuffer.get(dataBlockPos);

            const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
            // ...
            unsigned char curMask;

            if (offset < blockSize)
            {
                // ...
                curMask = dataBlockLoad.template get<pMask>()[offset];
                for (int i = 0 ; i < dim ; i++)
                {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
            }
            // ...
            sdataBlockPos.id = dataBlockPos;
            // ...
            sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
            // ...
        };

        CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize, lamb);

#endif
    }
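The threadGridSize expression used above is the usual ceiling division: one thread per scalar, localThreadBlockSize threads per CUDA block, rounded up. An equivalent, more compact formulation (editor's sketch, not taken from the header):

// ceil(numScalars / localThreadBlockSize) without floating point
inline unsigned int threadGridSizeFor(unsigned int numScalars, unsigned int localThreadBlockSize)
{
    return (numScalars + localThreadBlockSize - 1) / localThreadBlockSize;
    // e.g. 1000 scalars with 128-thread blocks -> 8 blocks (7*128 = 896 < 1000)
}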
    template <typename stencil, typename... Args>
    void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode, Args... args)
    {
        // ...
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer.size() * dataChunkSize;

        if (numScalars == 0) return;

        auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);

        CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
                     // ...
                     indexBuffer.toKernel(),
                     dataBuffer.toKernel(),
                     this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
                     // ...
    }
    template<typename ids_type>
    // ...
        for (int i = 0 ; i < chunk_ids.size() ; i++)
        {
            // ...
            auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);

            for (int j = 0 ; j < dim ; j++)
            {
                box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
                box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
            }

            chunks_box.add(box);
        }
    template<typename MemType, unsigned int ... prp>
    // ...
    {
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
        for (size_t i = 0 ; i < copySect.size() ; i++)
        {
            auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
            // ...
        }
    }

    template<unsigned int ... prp>
    void removeCopyToFinalize_phase1(mgpu::ofp_context_t & ctx, int opt)
    {
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
    }

    template<unsigned int ... prp>
    void removeCopyToFinalize_phase2(mgpu::ofp_context_t & ctx, int opt)
    {
        // ...
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
        for (size_t i = 0 ; i < copySect.size() ; i++)
        {
            auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);

            this->packRequest(sub_it,req);
        }
        // ...
        for (size_t i = 0 ; i < copySect.size() ; i++)
        {
            auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
            // ...
        }

        size_t req = mem.size();
        // ...
    }
    template<unsigned int ... prp>
    void removeCopyToFinalize_phase3(mgpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
    {
        // ...
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        {
            if (tmp2.size() == 0)
            // ...

            auto & add_buff = this->blockMap.private_get_vct_add_index();
            add_buff.swap(tmp2);

            auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
            ite = nadd_buff.getGPUIterator();
            CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
            // ...
            this->template flush<sLeft_<prp>...>(ctx,flush_type::FLUSH_ON_DEVICE);
            // ...
            auto & o_map = this->getSegmentToOutMap();
            auto & segments_data = this->getSegmentToMergeIndexMap();

            new_map.resize(a_map.size(),0);
            // ...
            ite = segments_data.getGPUIterator();

            if (ite.nblocks() != 0)
            {CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);}

            convert_blk.template hostToDevice<0>();
            // ...
        }

        RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
        // ...
        size_t n_accu_cnk = 0;
        for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
        {
            // ...
            size_t n_pnt = n_pnt_cp.get(i);

            void * data_base_ptr = data_base_ptr_cp.get(i);
            data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);

            ite.wthr.x = n_cnk_cp.get(i);
            // ...
            for (int j = 0 ; j < dim ; j++)
            {
                // ...
                if (l >= blockEdgeSize)
                {ite.thr.x *= blockEdgeSize;}
                // ...
            }

            if (ite.nblocks() != 0 && ite.thr.x != 0)
            {
                // ...
                AggregateT,decltype(convert_blk.toKernel()),decltype(new_map.toKernel()),
                decltype(data),decltype(chunks.toKernel()),prp... >),ite,
                (unsigned int *)scan_ptrs_cp.get(i),
                (unsigned short int *)offset_ptrs_cp.get(i),
                convert_blk.toKernel(),
                // ...
            }

            n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
        }

        saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
    }
    template<unsigned int n_it, unsigned int ... prp>
    // ...
                           bool is_pack_remote)
    {
        // ...
        {std::cerr << __FILE__ << ":" << __LINE__ << " error the packing request number differ from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        for (size_t i = 0 ; i < pack_subs.size() ; i++)
        {
            size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
            // ...
        }

        for (size_t i = 0 ; i < pack_subs.size() ; i++)
        {
            size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
            // ...
            index_ptr.ptr[i] = index_ptrs.get(i);
            scan_ptr.ptr[i] = scan_ptrs.get(i);
            // ...
            data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);

            offset_ptr.ptr[i] = offset_ptrs.get(i);
            mask_ptr.ptr[i] = mask_ptrs.get(i);
        }
        // ...
        calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
        // ...
        for (int i = 0 ; i < n_it ; i++)
        {
            for (int j = 0 ; j < sizeof...(prp) ; j++)
            {
                arr_data->ptr[i][j] = data_ptr.ptr[i][j];
            }
        }
        // ...
                     decltype(indexBuffer.toKernel()),
                     decltype(dataBuffer.toKernel()),
                     decltype(tmp.toKernel()),
                     // ...
                     dataBuffer.toKernel(),
                     indexBuffer.toKernel(),
                     // ...
        {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),indexBuffer.size()+1,pack_subs.size());}
    }
    template<unsigned int ... prp, typename S2>
    // ...
                                         mgpu::ofp_context_t &context)
    {
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        for (int i = 0 ; i < dim ; i++)
        {
            // ...
            origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);
            // ...
        }

        for (int i = 0 ; i < dim ; i++)
        // ...

        size_t actual_offset = n_cnk*sizeof(indexT);
        // ...
        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
        // ...
                      ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
        // ...
        size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
        // ...
        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
        // ...
        offset_ptrs_cp.add(offsets);
        scan_ptrs_cp.add(scan);
        n_cnk_cp.add(n_cnk);
        n_pnt_cp.add(n_pnt);
        data_base_ptr_cp.add(data_base_ptr);
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
        // ...
        for (int i = 0 ; i < dim ; i++)
        {shifts.last().template get<0>()[i] = 0;}

        for (int i = 0 ; i < dim ; i++)
        {
            int op_q = origPack_pnt.get(i) % blockEdgeSize;
            int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
            int quot = abs(ou_q - op_q) % blockEdgeSize;
            // ...
            for (int j = 0 ; j < sz ; j++)
            {
                // ...
                for (int k = 0 ; k < dim ; k++)
                {
                    shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);
                }
            }
        }

        shifts.template hostToDevice<0>();

        linearizer gridGeoPack(sz);
        // ...
        size_t sz[1] = {n_cnk};
        // ...
        auto ite = g.getGPUIterator();
        // ...
        for (int i = 0 ; i < dim ; i++)
        {
            sz_g.set_d(i,gridGeometry.getSize()[i]);
            origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);
        }
        // ...
        n_shifts_cp.add(shifts.size());
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
                    gridGeoPack,origPack_cnk,
                    gridGeometry,origUnpack_cnk,
                    // ...
        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
                    gridGeoPack,origPack_cnk,
                    gridGeometry,origUnpack_cnk,
                    // ...
        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
                    gridGeoPack,origPack_cnk,
                    gridGeometry,origUnpack_cnk,
                    // ...
        CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
                    gridGeoPack,origPack_cnk,
                    gridGeometry,origUnpack_cnk,
                    // ...

        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
        // ...
    }
    template<typename origPackType, typename IteratorType>
    // ...
    {
        for (int i = 0 ; i < dim ; i++)
        {
            int op_q = origPack.get(i) % blockEdgeSize;
            int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
            int quot = abs(ou_q - op_q) % blockEdgeSize;
            // ...
        }

        for (int j = 0 ; j < this->blockSize ; j++)
        {
            // ...
            for (int i = 0 ; i < dim ; i++)
            {
                int c = x % blockEdgeSize;

                if (quot_diff[i] + c < 0)
                {
                    offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
                    // ...
                }
                else if (quot_diff[i] + c >= blockEdgeSize)
                {
                    offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
                    // ...
                }
                else
                {
                    offset += pos_c*(quot_diff[i] + c);
                }
                // ...
                pos_c *= blockEdgeSize;
                bp_c *= (quot_diff[i] != 0)?2:1;
            }
            // ...
            convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
        }
    }
    typedef AggregateT value_type;

    typedef self device_grid_type;
    // ...
    :stencilSupportRadius(1)
    // ...
    :stencilSupportRadius(stencilSupportRadius)
    // ...
    : gridGeometry(gridGeometry),
      stencilSupportRadius(stencilSupportRadius)
    // ...
        for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
    // ...
            typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
            // ...
            decltype(extendedBlockGeometry),
            // ...
            typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
            // ...
            decltype(extendedBlockGeometry),
            // ...
            extendedBlockGeometry,
            stencilSupportRadius,
            // ...
            nn_blocks.toKernel(),
            // ...

    template<unsigned int nNN, unsigned int nLoop>
    // ...
            typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
            // ...
            decltype(extendedBlockGeometry),
            // ...
            typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
            // ...
            decltype(extendedBlockGeometry),
            // ...
            extendedBlockGeometry,
            stencilSupportRadius,
            // ...
            nn_blocks.toKernel(),
            // ...

        return gridGeometry;
    template<typename stencil_type>
    // ...
    {
        computeGhostLayerMapping<stencil_type>();
    }

    constexpr static unsigned int getBlockEdgeSize()
    {
        return blockEdgeSize;
    }

    constexpr unsigned int getBlockSize() const
    // ...

    template<typename CoordT>
    inline size_t getLinId(CoordT &coord)
    {
        return gridGeometry.LinId(coord);
    }
    // ...
        return gridGeometry.InvLinId(linId);
    // ...
        return gridSize.getGPUIterator(start,stop,n_thr);
    template<typename CoordT>
    // ...
    {
        auto glid = gridGeometry.LinId(coord);

        auto bid = glid / blockSize;
        auto lid = glid % blockSize;

        auto key = blockMap.get_sparse(bid);

        k.set_cnk_pos_id(key.id);
        // ...
    }

    template<unsigned int p, typename CoordT>
    // ...
    template<unsigned int p>
    // ...
    template<typename CoordT>
    // ...
        gridGeometry.LinId(coord,lin,offset);
    // ...
    template<unsigned int p>
    // ...
    template<unsigned int p, typename CoordT>
    auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
    // ...
    template<typename CoordT>
    // ...
        gridGeometry.LinId(coord,ind,offset);
        ite.wthr.x = indexBuffer.size();
        // ...
        ite.thr.x = getBlockSize();
        // ...
        output.resize(indexBuffer.size()+1);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
                     // ...
                     blockSize>),ite,this->toKernel(),output.toKernel(),db);
        // ...
        openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);

        output.template deviceToHost<0>(output.size()-1,output.size()-1);
        unsigned int padding_points = output.template get<0>(output.size()-1);
        // ...
        pd_points.resize(padding_points);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
                     // ...
                     ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
                     blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);
        // ...

        ite.wthr.x = indexBuffer.size();
        // ...
        ite.thr.x = getBlockSize();
        // ...
        output.resize(indexBuffer.size()+1);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
                     // ...
                     blockSize>),ite,this->toKernel(),output.toKernel(),db);
        // ...
        openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);

        output.template deviceToHost<0>(output.size()-1,output.size()-1);
        unsigned int padding_points = output.template get<0>(output.size()-1);
        // ...
        pd_points.resize(padding_points);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
                     // ...
                     ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
                     blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
    template<typename dim3T>
    // ...
    {
        // ...
                           dim3SizeToInt(nBlock),
                           dim3SizeToInt(nSlot)
        // ...
    }

    template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
    void tagBoundaries(mgpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
    {
        // ...
        const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
        unsigned int numScalars = indexBuffer.size() * dataChunkSize;

        if (numScalars == 0) return;
        if (findNN == false)
        {
            findNeighbours<stencil_type>();
        }
        // ...
        unsigned int localThreadBlockSize = dataChunkSize;
        unsigned int threadGridSize = numScalars % dataChunkSize == 0
                                    ? numScalars / dataChunkSize
                                    : 1 + numScalars / dataChunkSize;
        // ...
        if (stencilSupportRadius == 1)
        {
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                    // ...
                    threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
        }
        else if (stencilSupportRadius == 2)
        {
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                    // ...
                    threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
        }
        else if (stencilSupportRadius == 0)
        {
            CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
                    // ...
                    threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
        }
        else
        {
            std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;
        }

        if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
        {
            // ...
            block_points.resize(indexBuffer.size() + 1);
            block_points.template get<0>(block_points.size()-1) = 0;
            block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);
            // ...
            ite.wthr.x = indexBuffer.size();
            // ...
            ite.thr.x = getBlockSize();
            // ...
                        dataBuffer.toKernel(),
                        block_points.toKernel());
            // ...
            openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),context);
            // ...
            block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
            size_t tot = block_points.template get<0>(block_points.size()-1);
            // ...
                        dataBuffer.toKernel(),
                        block_points.toKernel(),
                        // ...
        }

        cudaDeviceSynchronize();
    }
    template<typename NNtype = NNStar<dim>>
    void findNeighbours()
    {
        // ...
        const unsigned int numBlocks = indexBuffer.size();
        const unsigned int numScalars = numBlocks * NNtype::nNN;
        nn_blocks.resize(numScalars);

        if (numScalars == 0) return;
        // ...
        unsigned int localThreadBlockSize = NNtype::nNN;

        unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
                                    ? numScalars / localThreadBlockSize
                                    : 1 + numScalars / localThreadBlockSize;

        CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
                         threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
        // ...
    }
    size_t countExistingElements() const
    {
        // ...
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        size_t numExistingElements = 0;

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
        {
            auto dataBlock = dataBuffer.get(blockId);
            for (size_t elementId=0; elementId<blockSize; ++elementId)
            {
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask))
                {
                    ++numExistingElements;
                }
            }
        }

        return numExistingElements;
    }

    size_t countBoundaryElements()
    {
        // ...
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();

        size_t numBoundaryElements = 0;

        for (size_t blockId=0; blockId<bufferSize; ++blockId)
        {
            auto dataBlock = dataBuffer.get(blockId);
            for (size_t elementId=0; elementId<blockSize; ++elementId)
            {
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask) && this->isPadding(curMask))
                {
                    ++numBoundaryElements;
                }
            }
        }

        return numBoundaryElements;
    }
    void measureBlockOccupancyMemory(double &mean, double &deviation)
    {
        // ...
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();
        // ...
        for (size_t blockId=0; blockId<bufferSize; ++blockId)
        {
            auto dataBlock = dataBuffer.get(blockId);
            size_t numElementsInBlock = 0;
            for (size_t elementId=0; elementId<blockSize; ++elementId)
            {
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask))
                {
                    ++numElementsInBlock;
                }
            }
            double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
            measures.add(blockOccupancy);
        }

        standard_deviation(measures, mean, deviation);
    }

    void measureBlockOccupancy(double &mean, double &deviation)
    {
        // ...
        typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
        typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;
        const auto bufferSize = indexBuffer.size();
        // ...
        for (size_t blockId=0; blockId<bufferSize; ++blockId)
        {
            auto dataBlock = dataBuffer.get(blockId);
            size_t numElementsInBlock = 0;
            for (size_t elementId=0; elementId<blockSize; ++elementId)
            {
                const auto curMask = dataBlock.template get<pMask>()[elementId];

                if (this->exist(curMask) && !this->isPadding(curMask))
                {
                    ++numElementsInBlock;
                }
            }
            double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
            measures.add(blockOccupancy);
        }

        standard_deviation(measures, mean, deviation);
    }
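Both occupancy routines above collect, per chunk, the fraction of slots whose mask marks an existing point and then hand the samples to standard_deviation() from common_statistics.hpp. A small numeric illustration (the container type used for measures is an assumption):

openfpm::vector<double> measures;
measures.add(256.0 / 512.0);   // chunk A: half of an 8x8x8 block is occupied
measures.add(512.0 / 512.0);   // chunk B: completely full
double mean = 0.0, deviation = 0.0;
standard_deviation(measures, mean, deviation);   // mean == 0.75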
    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
    // ...
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
    }

    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
    // ...
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
    }

    template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
    // ...
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
    }

    template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
    // ...
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
    }

    template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
    // ...
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
    }

    Box<dim,int> getBox()
    {
        // ...
        for (int i = 0 ; i < dim ; i++)
        {
            // ...
            b.setHigh(i,gridGeometry.getSize()[i]);
        }
        // ...
    }
    template<typename stencil, typename... Args>
    void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
    {
        if (findNN == false)
        {
            findNeighbours<typename stencil::stencil_type>();
        }
        // ...
        case STENCIL_MODE_INPLACE:
            applyStencilInPlace<stencil>(box,mode,args...);
            // ...
        case STENCIL_MODE_INPLACE_NO_SHARED:
            applyStencilInPlaceNoShared<stencil>(box,mode,args...);
            // ...
    }

    template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
    void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
    {
        applyStencils<stencil1>(box,mode, args...);
        applyStencils<stencil2, otherStencils ...>(box,mode, args...);
    }

    template<typename BitMaskT>
    inline static bool isPadding(BitMaskT &bitMask)
    // ...

    template<typename BitMaskT>
    inline static void setPadding(BitMaskT &bitMask)
    // ...

    template<typename BitMaskT>
    inline static void unsetPadding(BitMaskT &bitMask)
    // ...

    template<typename CoordT>
    // ...
    {
        return gridGeometry.BlockLinId(blockCoord);
    }
    template<unsigned int p>
    // ...
    {
        indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
        indexT local_id = coord.get_data_id();
        // ...
        block_data.template get<BMG::pMask>()[local_id] = 1;

        return block_data.template get<p>()[local_id];
    }

    template<unsigned int p, typename CoordT>
    // ...
    {
        auto lin = gridGeometry.LinId(coord);
        indexT block_id = lin / blockSize;
        indexT local_id = lin % blockSize;
        // ...
        block_data.template get<BMG::pMask>()[local_id] = 1;

        return block_data.template get<p>()[local_id];
    }
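The two divisions above are the whole addressing scheme: the linear id returned by gridGeometry.LinId() carries the chunk index in the quotient and the in-chunk offset in the remainder. Worked numbers (illustrative only):

// 2D grid with blockEdgeSize = 8, so blockSize = 64 slots per chunk
size_t lin      = 1339;        // e.g. gridGeometry.LinId(coord)
size_t block_id = lin / 64;    // 20 -> which chunk
size_t local_id = lin % 64;    // 59 -> which slot inside that chunk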
    template<unsigned int p>
    void print_vct_add_data()
    {
        // ...
                threadBlockSize, indexT, layout_base> BMG;

        auto & bM = BMG::blockMap.private_get_vct_add_data();
        auto & vI = BMG::blockMap.private_get_vct_add_index();
        bM.template deviceToHost<p>();
        vI.template deviceToHost<0>();

        std::cout << "vct_add_data: " << std::endl;

        for (size_t i = 0 ; i < bM.size() ; i++)
        {
            std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
            for (size_t j = 0 ; j < blockSize ; j++)
            {
                std::cout << (int)bM.template get<p>(i)[j] << " ";
            }

            std::cout << std::endl;
        }
    }

    template<unsigned int p>
    void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
    {
        // ...
        BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
    }
    static bool packRequest()
    // ...

    template<int ... prp> inline
    // ...
        indexBuffer.template packRequest<prp ...>(req);
        dataBuffer.template packRequest<prp ...>(req);
    // ...
        indexBuffer.template pack<prp ...>(mem,sts);
        dataBuffer.template pack<prp ...>(mem,sts);
    // ...
        indexBuffer.template unpack<prp ...>(mem,ps);
        dataBuffer.template unpack<prp ...>(mem,ps);
    // ...
        if (mem.size() != 0)
        {std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}

    template<int ... prp> inline
    // ...
        ite.wthr.x = indexBuffer.size();
        // ...
        ite.thr.x = getBlockSize();
        // ...
        tmp.resize(indexBuffer.size() + 1);
        // ...
                    dataBuffer.toKernel(),
                    // ...
        openfpm::scan((indexT *)tmp.template getDeviceBuffer<0>(),
                      tmp.size(), (indexT *)tmp.template getDeviceBuffer<0>(), context);

        tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);

        size_t n_pnt = tmp.template get<0>(tmp.size()-1);
        // ...
        req = sizeof(indexT) +
              sizeof(indexT)*indexBuffer.size() +
              sizeof(indexT)*tmp.size() +
              n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char));
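The request size computed above is one index for the header, one per chunk, one per scan entry, plus per-point payload, offset and mask. A back-of-the-envelope instance (assuming indexT = long int, i.e. 8 bytes, and a single float property so spq.point_size == 4):

// 100 chunks -> indexBuffer.size() == 100, tmp.size() == 101, 1000 existing points
size_t req = 8                    // leading indexT
           + 8 * 100              // one indexT per chunk
           + 8 * 101              // scan array
           + 1000 * (4 + 2 + 1);  // data + short offset + mask byte per point
// req == 8 + 800 + 808 + 7000 == 8616 bytes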
    template<int ... prp> inline
    // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        offset_ptrs.clear();
    // ...

    template<int ... prp> inline
    // ...
        ite.wthr.x = indexBuffer.size();
        // ...
        ite.thr.x = getBlockSize();
        // ...
        if (indexBuffer.size() != 0)
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                     // ...
                     indexBuffer.toKernel(),
                     // ...
                     dataBuffer.toKernel(),
                     // ...
                     indexBuffer.size() + 1);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                     // ...
                     indexBuffer.toKernel(),
                     // ...
                     dataBuffer.toKernel(),
                     // ...
                     indexBuffer.size() + 1);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                     // ...
                     indexBuffer.toKernel(),
                     // ...
                     dataBuffer.toKernel(),
                     // ...
                     indexBuffer.size() + 1);
        // ...
        CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
                     // ...
                     indexBuffer.toKernel(),
                     // ...
                     dataBuffer.toKernel(),
                     // ...
                     indexBuffer.size() + 1);
        // ...
        std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        for (size_t i = 0 ; i < pack_subs.size() ; i++)
        {
            // ...
            tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
            tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
            // ...
            tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
            tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);

            openfpm::scan(((indexT *)tmp.template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
                          indexBuffer.size() + 1, (indexT *)tmp.template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), context);

            openfpm::scan(((unsigned int *)tmp.template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
                          indexBuffer.size() + 1, (unsigned int *)tmp.template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), context);

            tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
            tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);

            scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
            // ...
            n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
            n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);

            req += sizeof(size_t) +
                   // ...
                   sizeof(indexT)*n_cnk +
                   align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) +
                   align_number(sizeof(indexT),n_pnt*(spq.point_size)) +
                   align_number(sizeof(indexT),n_pnt*sizeof(short int)) +
                   align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
        }

        scan_it.template hostToDevice<0>();

        openfpm::scan((indexT *)scan_it.template getDeviceBuffer<0>(),
                      scan_it.size(), (indexT *)scan_it.template getDeviceBuffer<0>(), context);
        return this->blockMap.getMappingVector();
    // ...
        return this->blockMap.getMergeIndexMapVector();
    // ...
        unsigned int i = req_index;
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
        size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        for (int i = 0 ; i < dim ; i++)
        // ...
        mem.allocate(n_cnk*sizeof(indexT));
        // ...
        mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );
        // ...
        mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );
        // ...
        mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );
        // ...
        mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
    template<unsigned int ... prp>
    // ...
    {
        if ((opt & 0x3) == rem_copy_opt::PHASE1)
        {
            this->template removeCopyToFinalize_phase1<prp ...>(ctx,opt);
        }
        else if ((opt & 0x3) == rem_copy_opt::PHASE2)
        {
            this->template removeCopyToFinalize_phase2<prp ...>(ctx,opt);
        }
        else
        {
            this->template removeCopyToFinalize_phase3<prp ...>(ctx,opt,false);
        }
    }
    // ...
              bool is_pack_remote = false)
    {
        // ...
        RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
        // ...
        pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
        // ...
        pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
        // ...
        pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
        // ...
        std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
        // ...
        savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
    }
    // ...
        auto & vad = BMG::blockMap.private_get_vct_add_data();
        auto & vai = BMG::blockMap.private_get_vct_add_index();
        // ...
        offset_ptrs_cp.clear();
        scan_ptrs_cp.clear();
        // ...
        data_base_ptr_cp.clear();
        // ...
        n_shifts_cp.clear();
        convert_blk.clear();
        gridGeometry.swap(gr.gridGeometry);
    // ...
        if (rem_sects.size() != 0)
        {
            rem_sects.template hostToDevice<0,1>();

            tmp.resize(indexBuffer.size() + 1);

            tmp.template get<1>(tmp.size()-1) = 0;
            tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);

            auto ite = indexBuffer.getGPUIterator();

            if (has_work_gpu(ite) == true)
            {
                // ...
                CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
                             // ...
                             blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
                             gridGeometry,dataBuffer.toKernel(),
                             // ...

                openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),context);

                tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);
                // ...
                size_t nr_cnk = tmp.template get<1>(tmp.size()-1);

                tmp3.resize(nr_cnk);
                // ...
                ite = indexBuffer.getGPUIterator();

                if (has_work_gpu(ite) == false) {return;}

                CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());
                // ...
                ite = tmp3.getGPUIterator();

                ite.wthr.x = tmp3.size();
                // ...
                ite.thr.x = getBlockSize();
                // ...
                if (has_work_gpu(ite) == false) {return;}

                CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
                             // ...
                             ite,indexBuffer.toKernel(),
                             // ...
                             dataBuffer.toKernel(),
                             // ...
                             rem_sects.toKernel());
                // ...
            }
        }
    template<unsigned int ... prp>
    // ...
    {
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
        removeCopyToFinalize_phase3<prp ...>(context,opt,true);
        // ...
        offset_ptrs_cp.clear();
        scan_ptrs_cp.clear();
        data_base_ptr_cp.clear();
        // ...
        n_shifts_cp.clear();
        convert_blk.clear();
        // ...
        data_base_ptr_cp.clear();
    }
    // ...
        rem_sects.add(section_to_delete);
    // ...
        grid_src.copySect.add(sgs);
    template<typename pointers_type,
             typename headers_type,
             typename result_type,
             unsigned int ... prp >
    static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
    {
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);

        result.allocate(sizeof(int));

        CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
                         pointers.toKernel(),
                         // ...
                         (int *)result.getDevicePointer(),
                         // ...
    }
    template<unsigned int ... prp, typename S2, typename header_type>
    // ...
                header_type & headers,
                // ...
                mgpu::ofp_context_t &context,
                rem_copy_opt opt = rem_copy_opt::NONE_OPT)
    {
        // ...
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        size_t n_cnk = headers.template get<1>(ih);
        // ...
        ps.addOffset(2*dim*sizeof(unsigned int));

        size_t actual_offset = n_cnk*sizeof(indexT);
        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
        // ...
        size_t n_pnt = headers.template get<2>(ih);
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
        // ...
        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
        // ...
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));

        scan_ptrs_cp.add(scan);
        offset_ptrs_cp.add(offsets);
        data_base_ptr_cp.add(data_base_ptr);
        // ...
    }
    template<unsigned int ... prp, typename S2>
    // ...
                mgpu::ofp_context_t &context,
                rem_copy_opt opt = rem_copy_opt::NONE_OPT)
    {
        // ...
        if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
        // ...
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
        // ...
        ps.addOffset(2*dim*sizeof(unsigned int));

        size_t actual_offset = n_cnk*sizeof(indexT);
        unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
        // ...
                      ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
        // ...
        size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
        actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
        // ...
        actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
        // ...
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
        actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));

        scan_ptrs_cp.add(scan);
        offset_ptrs_cp.add(offsets);
        data_base_ptr_cp.add(data_base_ptr);
        // ...
    }
    auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
    // ...
    auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
    // ...
#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)

    template<typename Tw = float>
    bool write(const std::string & output)
    {
        // ...
        return write_with_spacing_offset(output,spacing,offset);
    }

    template<typename Tw = float>
    // ...
    {
        file_type ft = file_type::BINARY;
        // ...
        auto & index = bm.getIndexBuffer();
        auto & data = bm.getDataBuffer();
        // ...
        auto it = index.getIterator();
        // ...
        auto key = it.get();
        // ...
        for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
        {
            // ...
            for (size_t k = 0 ; k < dim ; k++)
            {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}
            // ...
            cp(data.get_o(key),tmp_prp.last(),key,i);

            boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);

            tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
            // ...
        }
        // ...
        vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
        // ...
        return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
    }
    // ...
        fill_chunks_boxes(chunks_box,ids,spacing,offset);

        vtk_box1.add(chunks_box);
        vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));
        // ...
        write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
template<unsigned int dim,
         typename AggregateT,
         // ...
         typename indexT = long int,
         // ...
template<unsigned int dim,
         typename AggregateT,
         // ...
         typename indexT = int,
         // ...
template<unsigned int dim,
         typename AggregateT,
         // ...
         typename indexT = int,
         // ...

#endif // OPENFPM_PDATA_SPARSEGRIDGPU_HPP

Box< dim, int > getBox()
Return a Box with the range of the SparseGrid.
void operator()(T &t) const
It calls the copy function for each property.
int get_cnk_pos_id() const
Get chunk position id.
auto get(const sparse_grid_gpu_index< self > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists).
void * base_ptr
data pointers
size_t getOffset()
Return the actual counter.
This class represent an N-dimensional box.
openfpm::vector_gpu< aggregate< int[dim]> > shifts
shifts for chunk conversion
void packRequest(size_t &req, mgpu::ofp_context_t &context) const
memory requested to pack this object
openfpm::vector_gpu< aggregate< indexT > > e_points
size_t getBlockLinId(const CoordT &blockCoord) const
Linearization of block coordinates.
virtual size_t size() const
Get the size of the LAST allocated memory.
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId, is_new).template get< p >())
insert a block + flush, host version
unsigned char getFlag(const sparse_grid_gpu_index< self > &coord) const
Return the flag of the point.
void setGPUInsertBuffer(int nBlock, int nSlot)
SparseGridGpu(linearizer &gridGeometry, unsigned int stencilSupportRadius=1)
Constructor from grid geometry.
auto get_o(const grid_key_dx< dim, CoordT > &coord) const -> encap_data_block< typename std::remove_const< decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::get(0))>::type >
Get an element using the point coordinates.
decltype(blockMap) & private_get_blockMap()
Return internal structure block map.
auto private_get_add_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the blocks.
grid_key_dx is the key to access any element in the grid
virtual void deviceToHost()
Do nothing.
SparseGridGpu(const size_t(&res)[dim], unsigned int stencilSupportRadius=1)
Constructor from grid geometry.
void construct_link_dw(self &grid_dw, const Box< dim, int > &db_, Point< dim, int > p_dw, mgpu::ofp_context_t &context)
construct link on the down level
int get_data_id() const
Get chunk local index (the returned index < getblockSize())
openfpm::vector_gpu< aggregate< indexT, unsigned int > > tmp
temporal
void removeUnusedBuffers()
Eliminate many internal temporary buffers; you can use this between flushes if you get some out of memo...
ExtPreAlloc< CudaMemory > * prAlloc_prp
Memory to remove copy finalize.
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
virtual bool allocate(size_t sz)
allocate memory
openfpm::vector_gpu< aggregate< short int, short int > > ghostLayerToThreadsMapping
openfpm::vector_gpu< aggregate< indexT > > tmp2
temporal 2
static constexpr bool isCompressed()
This is a multiresolution sparse grid, so it is a compressed format.
decltype(self::type_of_iterator()) getIterator() const
Return a SparseGrid iterator.
virtual void hostToDevice()
Move memory from host to device.
virtual void * getPointer()
Return the pointer of the last allocation.
openfpm::vector_gpu< aggregate< unsigned int > > pack_output
Helper array to pack points.
size_t getOffsetEnd()
Get offset.
void pack(ExtPreAlloc< HeapMemory > &mem, Pack_stat &sts) const
Pack the object into the memory.
void conv_cross_b(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
virtual void * getPointer()
get a readable pointer with the data
openfpm::vector_gpu< Box< dim, int > > pack_subs
the set of all sub-set to pack
void construct_link(self &grid_up, self &grid_dw, mgpu::ofp_context_t &context)
construct link between levels
__device__ __host__ index_type get(index_type i) const
Get the i index.
static __device__ bool getNNindex_offset(grid_key_dx< dim, indexT2 > &coord, unsigned int &NN_index, unsigned int &offset_nn)
given a coordinate written in the local coordinates of a given chunk, it returns the neighborhood chunk position ...
This class implement the point shape in an N-dimensional space.
void removeAddUnpackReset()
In this case it does nothing.
size_t size() const
return the size of the grid
void unpack(ExtPreAlloc< CudaMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
void construct_link_up(self &grid_up, const Box< dim, int > &db_, Point< dim, int > p_up, mgpu::ofp_context_t &context)
construct link on the up levels
void packReset()
Reset the pack calculation.
void remove(const Box< dim, int > §ion_to_delete)
Remove all the points in this region.
auto get_o(const sparse_grid_gpu_index< self > &coord) const -> encap_data_block< typename std::remove_const< decltype(private_get_data_array().get(0))>::type >
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists).
This class allocate, and destroy CPU memory.
openfpm::vector_gpu< aggregate< int, short int > > link_dw
links of the padding points with real points of a finer sparsegrid
static SparseGridGpu_iterator< dim, self > type_of_iterator()
This is a meta-function return which type of iterator a grid produce.
virtual bool allocate(size_t sz)
Allocate a chunk of memory.
Transform the boost::fusion::vector into memory specification (memory_traits)
void removePoints(mgpu::ofp_context_t &context)
Remove the points we queues to remove.
This class is a functor for the "for_each" algorithm.
__device__ __host__ void setHigh(int i, T val)
set the high interval of the box
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
int sgn(T val)
Gets the sign of a variable.
void removeAddUnpackFinalize(mgpu::ofp_context_t &context, int opt)
This function remove the points we queue to remove and it flush all the added/unpacked data.
openfpm::vector_gpu< aggregate< int > > new_map
Map between the (Last) added chunks and their position in chunks data.
openfpm::vector_gpu< aggregate< unsigned int > > link_up_scan
scan offsets of the links down
virtual void * getDevicePointer()
get a readable pointer with the data
void resize(size_t(&res)[dim])
resize the SparseGrid
void unpack(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, mgpu::ofp_context_t &context, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
virtual void hostToDevice()
Move memory from host to device.
void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
openfpm::vector_gpu< aggregate< indexT > > tmp3
temporal 3
auto private_get_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer())
Return the index array of the blocks.
base_key get_sparse(const grid_key_dx< dim, CoordT > &coord) const
Get an element using the point coordinates.
static void unpack_headers(pointers_type &pointers, headers_type &headers, result_type &result, int n_slot)
Stub does not do anything.
Element index contain a data chunk index and a point index.
bool isSkipLabellingPossible()
This function check if keep geometry is possible for this grid.
void unpack(ExtPreAlloc< HeapMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
void removeCopyToFinalize(mgpu::ofp_context_t &ctx, int opt)
It finalizes the operations queued by remove() and copy_to()
void packFinalize(ExtPreAlloc< CudaMemory > &mem, Pack_stat &sts, int opt=0, bool is_pack_remote=false)
Finalize the packing procedure.
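The pack members are meant to be used as a sequence; a sketch of the order suggested by their briefs (the property list <0> and the names sub_it, ctx, mem are assumptions):

    sg.packReset();                                 // start a new pack calculation
    size_t req = 0;
    sg.template packRequest<0>(sub_it, req);        // size needed to pack the region described by sub_it
    sg.packCalculate(req, ctx);                     // complete the size calculation on the GPU
    // ... allocate an ExtPreAlloc<CudaMemory> 'mem' of at least req bytes (allocation omitted) ...
    Pack_stat sts;
    sg.template pack<0>(mem, sub_it, sts);          // queue the packing of the region
    sg.packFinalize(mem, sts);                      // execute all queued pack operations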
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
Return the mapping vector used to know where the data has been added.
void one()
Set the point coordinates to one.
static bool is_unpack_header_supported()
Indicate that unpacking the header is supported.
size_t getOffset()
Get offset.
__device__ __host__ void setLow(int i, T val)
set the low interval of the box
auto private_get_data_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer())
Return the data array of the blocks.
auto insert_o(unsigned int linId) -> decltype(blockMap.insert(0))
insert data, host version
void conv2(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
virtual bool resize(size_t sz)
resize the allocated memory
virtual size_t size() const
Return the size of the allocated memory.
void setNNType()
Set the neighborhood type.
void addOffset(size_t off)
Increment the offset pointer by off.
virtual void incRef()
Increment the reference counter.
void pack(ExtPreAlloc< CudaMemory > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Pack_stat &sts)
Pack the object into the memory given an iterator.
auto get(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index guarantees that the point exists)
void convertChunkIds(short int *offset, origPackType &origPack, IteratorType &sub_it)
convert the offset index from the packed buffer to the add buffer
auto private_get_data_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer()) &
Return the data array of the blocks.
grid_key_dx< dim > getStart() const
Return the starting point.
auto private_get_neighborhood_array() -> decltype(nn_blocks) &
Return the neighborhood array of the blocks.
void addAndConvertPackedChunkToTmp(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, mgpu::ofp_context_t &context)
unpack the sub-grid object
openfpm::vector_gpu< aggregate< unsigned int > > & getDownLinksOffsets()
Get the offsets for each point of the links down.
void copyRemoveReset()
Reset the queue to remove and copy section of grids.
void conv_cross(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a convolution using a cross like stencil.
void packCalculate(size_t &req, mgpu::ofp_context_t &context)
Calculate the size of the information to pack.
static SparseGridGpu_iterator_sub< dim, self > type_of_subiterator()
This is a meta-function returning the type of sub-grid iterator a grid produces.
__device__ __host__ void zero()
Set the point coordinates to zero.
openfpm::vector_gpu< aggregate< int, short int > > link_up
links of the padding points with real points of a finer sparsegrid
static void unpack(ExtPreAlloc< Mem >, T &obj)
Error, no implementation.
openfpm::vector_gpu< aggregate< unsigned int > > link_dw_scan
scan offsets of the links down
int yes_i_am_grid
it defines that this data-structure is a grid
openfpm::vector_gpu< aggregate< size_t > > links_up
links of the padding points with real points of a coarse sparsegrid
void packRequest(size_t &req) const
Asking to pack a SparseGridGpu without a GPU context packs the grid on the CPU into host memory.
get the type of the block
openfpm::vector_gpu< aggregate< int, short int > > & getDownLinks()
Get the links down for each point.
openfpm::vector_gpu< aggregate< unsigned int > > & getUpLinksOffsets()
Get the offsets for each point of the links up.
auto insertFlush(const grid_key_dx< dim, CoordT > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
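A host-side sketch of insertFlush (a 3D grid and a float property 0 are assumptions):

    grid_key_dx<3> p;
    p.set_d(0, 64); p.set_d(1, 64); p.set_d(2, 64); // point coordinates
    sg.template insertFlush<0>(p) = 1.0f;           // create the point if needed and set property 0
    float v = sg.template get<0>(p);                // read it back on the host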
auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
Return the mapping vector used to know where the data has been added.
grid_key_dx< dim > getStop() const
Return the stop point.
virtual void decRef()
Decrement the reference counter.
void copy_to(self &grid_src, const Box< dim, size_t > &box_src, const Box< dim, size_t > &box_dst)
It queues a copy.
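copy_to() only queues the work; the queued remove()/copy_to() operations are executed by removeCopyToFinalize(). A sketch with assumed boxes, source grid and context:

    Box<3, size_t> box_src, box_dst;
    // ... fill the boxes with setLow()/setHigh() ...
    sg.copy_to(grid_src, box_src, box_dst);         // queue the copy from grid_src
    sg.removeCopyToFinalize(ctx, 0);                // execute queued operations (0 assumed to mean default behaviour)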
openfpm::vector_gpu< aggregate< int, short int > > & getUpLinks()
Get the links up for each point.
void reset()
Reset the internal counters.
void unpack_with_headers(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, header_type &headers, int ih, Unpack_stat &ps, mgpu::ofp_context_t &context, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
void preFlush()
In case we manually set the added index buffer and the add data buffer, we have to call this function before flushing.
void conv(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
auto insertFlush(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void removeUnusedBuffers()
Eliminate many internal temporary buffers; you can use this between flushes if you run into out-of-memory errors.
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native structure
auto private_get_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer()) &
Return the index array of the blocks.
linearizer & getGrid()
Return the grid information object.
void operator()(T &t) const
It calls the copy function for each property.
void packRequest(SparseGridGpu_iterator_sub< dim, self > &sub_it, size_t &req) const
Calculate the size to pack part of this structure.
openfpm::vector_gpu< aggregate< indexT > > scan_it
contains the scan of the points for each iterator
Implementation of a 1-D std::vector-like structure.
auto private_get_add_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the added blocks.
__device__ __host__ T getHigh(int i) const
get the high interval of the box
static void pack(ExtPreAlloc< Mem >, const T &obj)
Error, no implementation.
this class is a functor for the "for_each" algorithm
virtual void * getDevicePointer()
Return the pointer of the last allocation.
get the type of the insertBlock
void setDimensions(const size_t(&dims)[N])
Reset the dimension of the grid.
void setBackgroundValue(typename boost::mpl::at< typename AggregateT::type, boost::mpl::int_< p >>::type backgroundValue)
set the background for property p
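For example, with property 0 assumed to be a float, the value returned for non-existing points can be set as:

    sg.template setBackgroundValue<0>(0.0f);        // value seen when reading a point that does not exist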
auto get(const grid_key_dx< dim, CoordT > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using the point coordinates.
void conv2_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.