#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
#define OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH

#include <SparseGridGpu/BlockMapGpu.hpp>
#include <SparseGridGpu/TemplateUtils/mathUtils.hpp>
#include "util/cuda_util.hpp"
#include "SparseGrid/cp_block.hpp"

#ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
#endif

#ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
#endif

namespace SparseGridGpuKernels
{
    template<typename SparseGridGpuType, typename pointers_type, typename headers_type>
    __global__ void unpack_headers(pointers_type pointers, headers_type headers, int * result, unsigned int sz_pack, int n_slot)
    {
        if (t > pointers.size()) {return;}

        unsigned char * data_pack = (unsigned char *)pointers.template get<0>(t);

        while (data_pack < pointers.template get<1>(t))
        {
            int ih = pointers.template get<2>(t);

            // with an 8-byte index type the header is read as a single size_t,
            // otherwise it is assembled from two 32-bit reads
            if (sizeof(typename SparseGridGpuType::indexT_) == 8)
            {headers.template get<0>(t*n_slot + ih) = *(size_t *)data_pack;}
            else
            {
                unsigned int dp1 = *(unsigned int *)data_pack;
                unsigned int dp2 = *(unsigned int *)&(data_pack[4]);
                headers.template get<0>(t*n_slot + ih) = (size_t)dp1 + (((size_t)dp2) << 32);
            }

            data_pack += sizeof(size_t);
            data_pack += SparseGridGpuType::unpack_headers(headers, data_pack, t*n_slot + ih, sz_pack);
            pointers.template get<2>(t) += 1;
        }
    }
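
    // Launch sketch (illustrative; toKernel()/getDevicePointer() and the exact launch
    // configuration are assumptions, not an API defined in this header): each thread walks
    // one packed message, so a single block sized on pointers.size() is the typical setup.
    //
    //   unpack_headers<SparseGridType><<<1, pointers.size()>>>(pointers.toKernel(),
    //                                                          headers.toKernel(),
    //                                                          (int *)result.getDevicePointer(),
    //                                                          sz_pack, n_slot);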
    // Cross-shaped (+/-1 along each axis) stencil helper, parametrized on the dimensionality.
    template<unsigned int dim>

    template<typename ScalarT, typename coordType, typename SparseGridT, unsigned int enlargedBlockSize,
             typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil(ScalarT & res, ScalarT & cur, coordType & coord,
                                          ScalarT (& enlargedBlock)[enlargedBlockSize],
                                          lambda_func f,
                                          SparseGridT & sparseGrid, ArgsT ... args)
    {
        for (int d = 0; d < dim; ++d)
        {
            // gather the -1/+1 neighbours along axis d from the ghost-enlarged shared block
            auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
            auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
            ScalarT neighbourPlus = enlargedBlock[nPlusId];
            ScalarT neighbourMinus = enlargedBlock[nMinusId];
            cs.xm[d] = neighbourMinus;
            cs.xp[d] = neighbourPlus;
        }

        res = f(cur, cs, args ...);
    }
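
    // Example lambda (a sketch, assuming the cross-stencil object cs exposes the xm[d]/xp[d]
    // neighbour arrays filled above): an explicit heat-equation step that such a helper can
    // evaluate point-wise.
    //
    //   auto heat_step = [] __device__ (float u, cross_stencil<3,float> & cs)
    //   {
    //       float lap = 0.0f;
    //       for (int d = 0 ; d < 3 ; d++)
    //       {lap += cs.xp[d] + cs.xm[d] - 2.0f*u;}   // discrete Laplacian
    //       return u + 0.1f*lap;                     // u_new = u + dt*alpha*lap, with dt*alpha = 0.1
    //   };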
    // Generic fallback: the convolution helpers are only implemented by the 2D and 3D specializations below.
    template<unsigned int dim>

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil(ScalarT & res, coordType & coord,
                                          CpBlockType & cpb,
                                          lambda_func f,
                                          ArgsT ... args)
    {
        printf("Convolution operation on GPU: Dimension not implemented \n");
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord,
                                           CpBlockType & cpb1,
                                           CpBlockType & cpb2,
                                           lambda_func f,
                                           ArgsT ... args)
    {
        printf("Convolution operation on GPU: Dimension not implemented \n");
    }
    // 3D convolution helpers: the lambda receives the block view(s) and the (x,y,z) coordinate.
    template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil_block(ScalarT & res, coordType & coord,
                                                CpBlockType & cpb,
                                                DataBlockWrapperT & DataBlockLoad,
                                                unsigned int offset,
                                                lambda_func f,
                                                ArgsT ... args)
    {
        res = f(cpb, DataBlockLoad, offset, coord[0], coord[1], coord[2]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil(ScalarT & res, coordType & coord,
                                          CpBlockType & cpb,
                                          lambda_func f,
                                          ArgsT ... args)
    {
        res = f(cpb, coord[0], coord[1], coord[2]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord,
                                           CpBlockType & cpb1,
                                           CpBlockType & cpb2,
                                           lambda_func f,
                                           ArgsT ... args)
    {
        f(res1, res2, cpb1, cpb2, coord[0], coord[1], coord[2]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord,
                                                 CpBlockType & cpb1,
                                                 CpBlockType & cpb2,
                                                 DataBlockWrapperT & DataBlockLoad,
                                                 unsigned int offset,
                                                 lambda_func f,
                                                 ArgsT ... args)
    {
        f(res1, res2, cpb1, cpb2, DataBlockLoad, offset, coord[0], coord[1], coord[2]);
    }
    // 2D convolution helpers: same pattern as above with an (x,y) coordinate.
    template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil_block(ScalarT & res, coordType & coord,
                                                CpBlockType & cpb,
                                                DataBlockWrapperT & DataBlockLoad,
                                                unsigned int offset,
                                                lambda_func f,
                                                ArgsT ... args)
    {
        res = f(cpb, DataBlockLoad, offset, coord[0], coord[1]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil(ScalarT & res, coordType & coord,
                                          CpBlockType & cpb,
                                          lambda_func f,
                                          ArgsT ... args)
    {
        res = f(cpb, coord[0], coord[1]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord,
                                           CpBlockType & cpb1,
                                           CpBlockType & cpb2,
                                           lambda_func f,
                                           ArgsT ... args)
    {
        f(res1, res2, cpb1, cpb2, coord[0], coord[1]);
    }

    template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
    __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord,
                                                 CpBlockType & cpb1,
                                                 CpBlockType & cpb2,
                                                 DataBlockWrapperT & DataBlockLoad,
                                                 unsigned int offset,
                                                 lambda_func f,
                                                 ArgsT ... args)
    {
        f(res1, res2, cpb1, cpb2, DataBlockLoad, offset, coord[0], coord[1]);
    }
    // Convolution functor (source property p_src, destination p_dst): loads a ghost-enlarged
    // block of p_src into shared memory, evaluates the user lambda on every existing
    // non-padding point of the chunk and writes the result into p_dst.
    template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>

        static constexpr unsigned int supportRadius = stencil_size;

        template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
        static inline __device__ void stencil(
                SparseGridT & sparseGrid,
                const unsigned int dataBlockId,
                openfpm::sparse_index<unsigned int> dataBlockIdPos,
                unsigned int offset,
                grid_key_dx<dim, int> & pointCoord,
                DataBlockWrapperT & dataBlockLoad,
                DataBlockWrapperT & dataBlockStore,
                unsigned char curMask,
                lambda_func f,
                ArgT ... args)
        {
            typedef typename SparseGridT::AggregateBlockType AggregateT;
            typedef ScalarTypeOf<AggregateT, p_src> ScalarT;

            constexpr unsigned int enlargedBlockSize = IntPow<SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;

            __shared__ ScalarT enlargedBlock[enlargedBlockSize];

            // pre-fill the ghost-enlarged shared block with the background value of p_src
            for (int i = 0; i < n_loop ; i++)
            {
                if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
                {
                    enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
                }
            }

            typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;

            sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                // recover the dim-dimensional coordinate of the point inside the block
                unsigned int linIdTmp = offset;
                for (unsigned int d = 0; d < dim; ++d)
                {
                    coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
                    linIdTmp /= SparseGridT::blockEdgeSize_;
                }

                // the user lambda is evaluated on the shared-memory block (via the
                // dim-specific helpers above) to produce res
                dataBlockStore.template get<p_dst>()[offset] = res;
            }
        }

        template <typename SparseGridT, typename CtxT>
        static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
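
    // Usage sketch (hedged: the conv-style call and the u(i,j,k) block accessor follow the
    // SparseGridGpu test-suite conventions and are not defined in this header): functors of
    // this family are driven by a device lambda that receives the shared block view and the
    // in-block coordinates, e.g. a 3D Laplacian written from property 0 into property 1:
    //
    //   grid.template conv<0, 1, 1>(start, stop,
    //       [] __device__ (auto & u, int i, int j, int k)
    //       {
    //           return u(i+1,j,k) + u(i-1,j,k)
    //                + u(i,j+1,k) + u(i,j-1,k)
    //                + u(i,j,k+1) + u(i,j,k-1) - 6.0f*u(i,j,k);
    //       });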
    // Variant of the previous convolution functor with the same load/mask/store structure.
    template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>

        static constexpr unsigned int supportRadius = stencil_size;

        template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
        static inline __device__ void stencil(
                SparseGridT & sparseGrid,
                const unsigned int dataBlockId,
                openfpm::sparse_index<unsigned int> dataBlockIdPos,
                unsigned int offset,
                grid_key_dx<dim, int> & pointCoord,
                DataBlockWrapperT & dataBlockLoad,
                DataBlockWrapperT & dataBlockStore,
                unsigned char curMask,
                lambda_func f,
                ArgT ... args)
        {
            typedef typename SparseGridT::AggregateBlockType AggregateT;
            typedef ScalarTypeOf<AggregateT, p_src> ScalarT;

            constexpr unsigned int enlargedBlockSize = IntPow<SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;

            __shared__ ScalarT enlargedBlock[enlargedBlockSize];

            // pre-fill the ghost-enlarged shared block with the background value of p_src
            for (int i = 0; i < n_loop ; i++)
            {
                if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
                {
                    enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
                }
            }

            typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;

            sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                // recover the dim-dimensional coordinate of the point inside the block
                unsigned int linIdTmp = offset;
                for (unsigned int d = 0; d < dim; ++d)
                {
                    coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
                    linIdTmp /= SparseGridT::blockEdgeSize_;
                }

                // the user lambda produces res for this point
                dataBlockStore.template get<p_dst>()[offset] = res;
            }
        }

        template <typename SparseGridT, typename CtxT>
        static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
    // Two-property convolution functor: loads ghost-enlarged blocks of p_src1 and p_src2 and
    // writes two results (p_dst1, p_dst2) per point.
    template<unsigned int dim, unsigned int n_loop,
             unsigned int p_src1, unsigned int p_src2,
             unsigned int p_dst1, unsigned int p_dst2,
             unsigned int stencil_size>

        static constexpr unsigned int supportRadius = stencil_size;

        template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
        static inline __device__ void stencil(
                SparseGridT & sparseGrid,
                const unsigned int dataBlockId,
                openfpm::sparse_index<unsigned int> dataBlockIdPos,
                unsigned int offset,
                grid_key_dx<dim, int> & pointCoord,
                DataBlockWrapperT & dataBlockLoad,
                DataBlockWrapperT & dataBlockStore,
                unsigned char curMask,
                lambda_func f,
                ArgT ... args)
        {
            typedef typename SparseGridT::AggregateBlockType AggregateT;
            typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
            typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;

            constexpr unsigned int enlargedBlockSize = IntPow<SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;

            __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
            __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];

            typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;

            sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
            sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                // recover the dim-dimensional coordinate of the point inside the block
                unsigned int linIdTmp = offset;
                for (unsigned int d = 0; d < dim; ++d)
                {
                    coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
                    linIdTmp /= SparseGridT::blockEdgeSize_;
                }

                // the user lambda produces res1 and res2 for this point
                dataBlockStore.template get<p_dst1>()[offset] = res1;
                dataBlockStore.template get<p_dst2>()[offset] = res2;
            }
        }

        template <typename SparseGridT, typename CtxT>
        static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
    // Second two-property convolution variant (same structure as the previous functor).
    template<unsigned int dim, unsigned int n_loop,
             unsigned int p_src1, unsigned int p_src2,
             unsigned int p_dst1, unsigned int p_dst2,
             unsigned int stencil_size>

        static constexpr unsigned int supportRadius = stencil_size;

        template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
        static inline __device__ void stencil(
                SparseGridT & sparseGrid,
                const unsigned int dataBlockId,
                openfpm::sparse_index<unsigned int> dataBlockIdPos,
                unsigned int offset,
                grid_key_dx<dim, int> & pointCoord,
                DataBlockWrapperT & dataBlockLoad,
                DataBlockWrapperT & dataBlockStore,
                unsigned char curMask,
                lambda_func f,
                ArgT ... args)
        {
            typedef typename SparseGridT::AggregateBlockType AggregateT;
            typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
            typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;

            constexpr unsigned int enlargedBlockSize = IntPow<SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;

            __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
            __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];

            typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;

            sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
            sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                // recover the dim-dimensional coordinate of the point inside the block
                unsigned int linIdTmp = offset;
                for (unsigned int d = 0; d < dim; ++d)
                {
                    coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
                    linIdTmp /= SparseGridT::blockEdgeSize_;
                }

                // the user lambda produces res1 and res2 for this point
                dataBlockStore.template get<p_dst1>()[offset] = res1;
                dataBlockStore.template get<p_dst2>()[offset] = res2;
            }
        }

        template <typename SparseGridT, typename CtxT>
        static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
    // Cross-stencil functor: evaluates the user lambda on a ghost-enlarged shared block of
    // p_src and writes the updated block back to p_dst.
    template<unsigned int dim, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>

        static constexpr unsigned int supportRadius = stencil_size;

        template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
        static inline __device__ void stencil(
                SparseGridT & sparseGrid,
                const unsigned int dataBlockId,
                openfpm::sparse_index<unsigned int> dataBlockIdPos,
                unsigned int offset,
                grid_key_dx<dim, int> & pointCoord,
                DataBlockWrapperT & dataBlockLoad,
                DataBlockWrapperT & dataBlockStore,
                unsigned char curMask,
                lambda_func f,
                ArgT ... args)
        {
            typedef typename SparseGridT::AggregateBlockType AggregateT;
            typedef ScalarTypeOf<AggregateT, p_src> ScalarT;

            constexpr unsigned int enlargedBlockSize = IntPow<SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;

            __shared__ ScalarT enlargedBlock[enlargedBlockSize];

            sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);

            decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);

                linId = sparseGrid.getLinIdInEnlargedBlock(offset);
                ScalarT cur = enlargedBlock[linId];

                // the cross-stencil lambda is evaluated here (see the helper above) to produce res
            }

            if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
            {
                enlargedBlock[linId] = res;
            }

            sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
        }

        template <typename SparseGridT, typename CtxT>
        static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
    // Tag boundary (padding) points: for every existing point, check through the ghost-enlarged
    // mask block whether a neighbour is missing and set/unset the padding bit accordingly.
    template <unsigned int dim,
              unsigned int pMask,
              unsigned int stencilSupportRadius,
              typename NN_type,
              typename checker_type,
              typename IndexBufT,
              typename DataBufT,
              typename SparseGridT,
              typename nn_blocksT>
    __global__ void tagBoundaries(IndexBufT indexBuffer, DataBufT dataBuffer, SparseGridT sparseGrid, nn_blocksT nbT, checker_type chk)
    {
        constexpr unsigned int pIndex = 0;

        typedef typename IndexBufT::value_type IndexAggregateT;
        typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;

        typedef typename DataBufT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        typedef ScalarTypeOf<AggregateT, pMask> MaskT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        // one CUDA block per chunk, one thread per point of the chunk
        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x;

        constexpr unsigned int enlargedBlockSize = IntPow<sparseGrid.getBlockEdgeSize() + 2 * stencilSupportRadius, dim>::value;
        __shared__ MaskT enlargedBlock[enlargedBlockSize];

        if (dataBlockPos >= indexBuffer.size())
        {return;}

        const long long dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
        auto dataBlock = dataBuffer.get(dataBlockPos);

        sdataBlockPos.id = dataBlockPos;
        sparseGrid.template loadGhostBlock<pMask>(dataBlock, sdataBlockPos, enlargedBlock);

        bool check = chk.check(sparseGrid, dataBlockId, offset);

        if (offset < blockSize && check == true)
        {
            const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
            const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);

            MaskT cur = enlargedBlock[linId];
            if (sparseGrid.exist(cur))
            {
                bool isPadding = NN_type::isPadding(sparseGrid, coord, enlargedBlock);
                if (isPadding)
                {
                    sparseGrid.setPadding(enlargedBlock[linId]);
                }
                else
                {
                    sparseGrid.unsetPadding(enlargedBlock[linId]);
                }
            }
        }

        sparseGrid.template storeBlock<pMask>(dataBlock, enlargedBlock);
    }
    // For every padding point of grid_cu, count whether it has an existing counterpart in the
    // coarser grid grid_up.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size, typename SparseGridType, typename outputType>
    __global__ void link_construct(SparseGridType grid_up, SparseGridType grid_cu, outputType out)
    {
        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x;

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        // 0x2 marks padding points in the mask
        if (dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x2)
        {
            auto id = indexBuffer.template get<0>(dataBlockPos);

            printf("HERE %d %d \n", pos.get(0), pos.get(1));

            for (int i = 0 ; i < dim ; i++)

            if (grid_up.template get<pMask>(pos) == 0x1)
            {
                atomicAdd(&out.template get<0>(dataBlockPos), 1);
            }
        }
    }
    // Count, per chunk, the padding points that fall inside the given box.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size, typename SparseGridType, typename outputType, typename BoxType>
    __global__ void count_paddings(SparseGridType grid_cu, outputType out, BoxType box)
    {
        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x;

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        // check whether the point is inside the box
        auto coord = grid_cu.getCoord(id, offset);

        bool active = box.isInsideKey(coord);
        if (active == false) {return;}

        // 0x2 marks padding points in the mask
        if (dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x2)
        {
            atomicAdd(&out.template get<0>(dataBlockPos), 1);
        }
    }
    // Collect the (chunk, offset) pairs of the padding points inside the box, using the
    // per-chunk prefix sums in stp as output offsets.
    template<unsigned int pMask, typename SparseGridType, typename ScanType, typename outputType, typename BoxType>
    __global__ void collect_paddings(SparseGridType grid_cu, ScanType stp, outputType out, BoxType box)
    {
        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x;

        __shared__ int counter;
        if (threadIdx.x == 0) {counter = 0;}
        __syncthreads();

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        // check whether the point is inside the box
        auto coord = grid_cu.getCoord(id, offset);

        bool active = box.isInsideKey(coord);
        if (active == false) {return;}

        int pad_offset = stp.template get<0>(dataBlockPos);

        // 0x2 marks padding points in the mask
        if (dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x2)
        {
            int cnt = atomicAdd(&counter, 1);

            out.template get<0>(pad_offset + cnt) = dataBlockPos;
            out.template get<1>(pad_offset + cnt) = offset;
        }
    }
    // For every padding point, count the links towards the finer grid grid_dw.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
             typename padPointType, typename SparseGridType, typename outputType>
    __global__ void link_construct_dw_count(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, outputType out, Point<dim,int> p_dw)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= padPoints.size()) {return;}

        const unsigned int dataBlockPos = padPoints.template get<0>(p);
        const unsigned int offset = padPoints.template get<1>(p);

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        for (int i = 0 ; i < dim ; i++)

        for (int j = 0 ; j < 2*dim ; j++)
        {
            for (int k = 0 ; k < dim ; k++)
            {
                kc.set_d(k, pos.get(k) + ((j >> k) & 0x1));
            }

            if (grid_dw.template get<pMask>(kc) & 0x1)
            {
                int a = atomicAdd(&out.template get<0>(p), 1);
            }
        }
    }
    // For every padding point, count the link towards the coarser grid grid_up.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
             typename padPointType, typename SparseGridType, typename outputType>
    __global__ void link_construct_up_count(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, outputType out, Point<dim,int> p_up)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= padPoints.size()) {return;}

        const unsigned int dataBlockPos = padPoints.template get<0>(p);
        const unsigned int offset = padPoints.template get<1>(p);

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        for (int i = 0 ; i < dim ; i++)

        if (grid_up.template get<pMask>(pos) & 0x1)
        {
            int a = atomicAdd(&out.template get<0>(p), 1);
        }
    }
    // Insert the downward links: for every padding point write the (chunk, offset) pairs of its
    // existing fine-grid counterparts, starting at the position given by the scan.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
             typename padPointType, typename SparseGridType, typename scanType, typename outputType>
    __global__ void link_construct_insert_dw(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_dw)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= padPoints.size()) {return;}

        const unsigned int dataBlockPos = padPoints.template get<0>(p);
        const unsigned int offset = padPoints.template get<1>(p);

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto & dataBuffer_dw = grid_dw.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        for (int i = 0 ; i < dim ; i++)

        unsigned int dataBlockPos_dw;
        unsigned int offset_dw;

        int link_offset = scan.template get<0>(p);

        for (int j = 0 ; j < 2*dim ; j++)
        {
            for (int k = 0 ; k < dim ; k++)
            {
                kc.set_d(k, pos.get(k) + ((j >> k) & 0x1));
            }

            grid_dw.get_sparse(kc, dataBlockPos_dw, offset_dw);

            if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
            {
                out.template get<0>(link_offset + c) = dataBlockPos_dw;
                out.template get<1>(link_offset + c) = offset_dw;
            }
        }
    }
    // Insert the upward link: for every padding point write the (chunk, offset) pair of its
    // existing coarse-grid counterpart at the position given by the scan.
    template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
             typename padPointType, typename SparseGridType, typename scanType, typename outputType>
    __global__ void link_construct_insert_up(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_up)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= padPoints.size()) {return;}

        const unsigned int dataBlockPos = padPoints.template get<0>(p);
        const unsigned int offset = padPoints.template get<1>(p);

        auto & indexBuffer = grid_cu.getIndexBuffer();
        auto & dataBuffer = grid_cu.getDataBuffer();

        auto & dataBuffer_dw = grid_up.getDataBuffer();

        auto id = indexBuffer.template get<0>(dataBlockPos);

        for (int i = 0 ; i < dim ; i++)

        unsigned int dataBlockPos_dw;
        unsigned int offset_dw;

        int link_offset = scan.template get<0>(p);

        grid_up.get_sparse(pos, dataBlockPos_dw, offset_dw);

        if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
        {
            out.template get<0>(link_offset) = dataBlockPos_dw;
            out.template get<1>(link_offset) = offset_dw;
        }
    }
    // For every chunk, compute the positions of its nNN neighbouring chunks: one thread per
    // (chunk, neighbour-slot) pair.
    template <unsigned int dim, typename nNN_type, typename IndexBufT, typename SparseGridT, typename nn_blocksT>
    __global__ void findNeighbours(IndexBufT indexBuffer, SparseGridT sparseGrid, nn_blocksT nn_blocks)
    {
        constexpr unsigned int pIndex = 0;

        typedef typename IndexBufT::value_type IndexAggregateT;
        typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;

        const unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

        const unsigned int dataBlockPos = pos / nNN_type::nNN;
        const unsigned int offset = pos % nNN_type::nNN;

        if (dataBlockPos >= indexBuffer.size())
        {return;}

        const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);

        auto neighbourPos = sparseGrid.template getNeighboursPos<nNN_type>(dataBlockId, offset);

        nn_blocks.template get<0>(dataBlockPos*nNN_type::nNN + offset) = neighbourPos;
    }
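
    // Launch sketch (illustrative; the thread-block size and toKernel() calls are assumptions):
    // the kernel expects one thread per (chunk, neighbour-slot) pair.
    //
    //   unsigned int nThreads = indexBuffer.size() * nNN_type::nNN;
    //   findNeighbours<dim, nNN_type><<<(nThreads + 255) / 256, 256>>>(indexBuffer.toKernel(),
    //                                                                  sparseGrid.toKernel(),
    //                                                                  nn_blocks.toKernel());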
    // Apply a stencil functor in place: one CUDA block per chunk, one thread per point; the
    // point mask is combined with the box bx so that only points inside bx are processed.
    template <unsigned int dim,
              unsigned int pMask,
              typename stencil,
              typename IndexBufT,
              typename DataBufT,
              typename SparseGridT,
              typename ... Args>
    __global__ void applyStencilInPlace(
            Box<dim,int> bx,
            IndexBufT indexBuffer,
            DataBufT dataBuffer,
            SparseGridT sparseGrid,
            Args ... args)
    {
        constexpr unsigned int pIndex = 0;

        typedef typename IndexBufT::value_type IndexAggregateT;
        typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;

        typedef typename DataBufT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        typedef ScalarTypeOf<AggregateT, pMask> MaskT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        // one CUDA block per chunk, one thread per point of the chunk
        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x;

        if (dataBlockPos >= indexBuffer.size())
        {return;}

        auto dataBlockLoad = dataBuffer.get(dataBlockPos);

        const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);

        unsigned char curMask;

        if (offset < blockSize)
        {
            // mask out points whose coordinate pointCoord falls outside bx
            curMask = dataBlockLoad.template get<pMask>()[offset];
            for (int i = 0 ; i < dim ; i++)
            {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i)) ? 0 : 0xFF;}
        }

        sdataBlockPos.id = dataBlockPos;

        stencil::stencil(
                sparseGrid, dataBlockId, sdataBlockPos, offset, pointCoord, dataBlockLoad, dataBlockLoad,
                curMask, args ...);
    }
    // Point-list variant of the stencil application: one thread per existing point taken from
    // the grid's point buffer, no shared-memory staging.
    template <unsigned int dim,
              unsigned int pMask,
              typename stencil,
              typename IndexBufT,
              typename DataBufT,
              typename SparseGridT,
              typename ... Args>
    __global__ void applyStencilInPlaceNoShared(
            Box<dim,int> bx,
            IndexBufT indexBuffer,
            DataBufT dataBuffer,
            SparseGridT sparseGrid,
            Args ... args)
    {
        constexpr unsigned int pIndex = 0;

        typedef typename IndexBufT::value_type IndexAggregateT;
        typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;

        typedef typename DataBufT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        typedef ScalarTypeOf<AggregateT, pMask> MaskT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        int p = blockIdx.x * blockDim.x + threadIdx.x;

        auto & pntBuff = sparseGrid.getPointBuffer();

        if (p >= pntBuff.size())
        {return;}

        auto id = pntBuff.template get<0>(p);

        const unsigned int dataBlockPos = id / blockSize;
        const unsigned int offset = id % blockSize;

        auto dataBlockLoad = dataBuffer.get(dataBlockPos);

        const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);

        unsigned char curMask;

        if (offset < blockSize)
        {
            // mask out points whose coordinate pointCoord falls outside bx
            curMask = dataBlockLoad.template get<pMask>()[offset];
            if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
        }

        sdataBlockPos.id = dataBlockPos;

        stencil::stencil(
                sparseGrid, dataBlockId, sdataBlockPos, offset, pointCoord, dataBlockLoad, dataBlockLoad,
                curMask, args ...);
    }
    // Compact the existing points of every chunk into a flat list, using the per-chunk scan
    // scanBuf as output offsets.
    template<unsigned int pMask, typename dataBuffType, typename scanType, typename outType>
    __global__ void fill_e_points(dataBuffType dataBuf, scanType scanBuf, outType output)
    {
        typedef typename dataBuffType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x % blockSize;

        __shared__ int ato_cnt;

        if (threadIdx.x == 0) {ato_cnt = 0;}
        __syncthreads();

        if (dataBlockPos >= scanBuf.size() - 1)
        {return;}

        // 0x1 marks existing points in the mask
        int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;

        int id = atomicAdd(&ato_cnt, predicate);

        if (predicate == true)
        {
            output.template get<0>(id + scanBuf.template get<0>(dataBlockPos)) = offset + dataBlockPos * blockSize;
        }
    }
    // Count the existing points of every chunk.
    template<unsigned int pMask, typename dataBufferType, typename outType>
    __global__ void calc_exist_points(dataBufferType dataBuf, outType output)
    {
        typedef typename dataBufferType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x % blockSize;

        __shared__ int ato_cnt;

        if (threadIdx.x == 0) {ato_cnt = 0;}
        __syncthreads();

        if (dataBlockPos >= output.size())
        {return;}

        // 0x1 marks existing points in the mask
        int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;

        atomicAdd(&ato_cnt, predicate);
        __syncthreads();

        output.template get<0>(dataBlockPos) = ato_cnt;
    }
    // Flag the chunks whose box intersects at least one of the removal boxes.
    template<unsigned int dim,
             unsigned int pMask,
             unsigned int blockEdgeSize,
             typename dataBufferType,
             typename outType,
             typename boxesVector_type,
             typename grid_smb_type,
             typename indexBuffer_type>
    __global__ void calc_remove_points_chunks_boxes(indexBuffer_type indexBuffer,
                                                    boxesVector_type boxes,
                                                    dataBufferType dataBuf,
                                                    outType output)
    {
        typedef typename dataBufferType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;

        const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;

        if (dataBlockPos >= indexBuffer.size())
        {return;}

        auto id = indexBuffer.template get<0>(dataBlockPos);

        // build the box b covered by this chunk, starting from the chunk origin pnt
        for (int i = 0 ; i < dim ; i++)
        {
            b.setLow(i, pnt.get(i));
            b.setHigh(i, pnt.get(i) + blockEdgeSize - 1);
        }

        output.template get<1>(dataBlockPos) = 0;
        for (int k = 0 ; k < boxes.size() ; k++)
        {
            // the chunk is flagged if its box b intersects boxes.get(k)
                output.template get<1>(dataBlockPos) = 1;
        }
    }
    // Compact the indices of the chunks flagged for removal (act holds the scanned flags).
    template<typename outType, typename activeCnkType>
    __global__ void collect_rem_chunks(activeCnkType act, outType output)
    {
        const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;

        if (dataBlockPos >= act.size() - 1)
        {return;}

        auto id = act.template get<1>(dataBlockPos);
        auto id_p1 = act.template get<1>(dataBlockPos + 1);

        // the scan increments exactly at the flagged chunks
        if (id != id_p1)
        {
            output.template get<0>(id) = dataBlockPos;
        }
    }
    // Clear the existence mask of the points that fall inside any of the removal boxes
    // (only the chunks previously flagged as active are processed).
    template<unsigned int dim, unsigned int pMask,
             typename dataBufferType,
             typename indexBufferType,
             typename grid_smb_type,
             typename activeCntType,
             typename boxesVector_type>
    __global__ void remove_points(indexBufferType indexBuffer,
                                  dataBufferType dataBuffer,
                                  activeCntType active_blocks,
                                  boxesVector_type boxes)
    {
        typedef typename dataBufferType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        const unsigned int dataBlockPos = active_blocks.template get<0>(blockIdx.x);
        const unsigned int offset = threadIdx.x % blockSize;

        if (dataBlockPos >= dataBuffer.size() - 1)
        {return;}

        // 0x1 marks existing points in the mask
        int predicate = dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x1;

        auto id = indexBuffer.template get<0>(dataBlockPos);

        // reconstruct the global coordinate of the point
        for (int i = 0 ; i < dim ; i++)

        if (predicate == true)
        {
            for (int k = 0 ; k < boxes.size() ; k++)
            {
                // if the point falls inside boxes.get(k) its existence bit is cleared
                    dataBuffer.template get<pMask>(dataBlockPos)[offset] = 0;
            }
        }
    }
    // For every chunk count, per box, how many existing points fall inside that box; the output
    // is laid out with stride stride_size per box.
    template<unsigned int dim,
             unsigned int pMask,
             unsigned int numCnt,
             typename indexT,
             typename dataBufferType,
             typename outType,
             typename boxesVector_type,
             typename grid_smb_type,
             typename indexBuffer_type>
    __global__ void calc_exist_points_with_boxes(indexBuffer_type indexBuffer,
                                                 boxesVector_type boxes,
                                                 dataBufferType dataBuf,
                                                 outType output,
                                                 unsigned int stride_size)
    {
        typedef typename dataBufferType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x % blockSize;

        __shared__ int ato_cnt[numCnt];

        if (threadIdx.x < numCnt)
        {ato_cnt[threadIdx.x] = 0;}

        __syncthreads();

        if (numCnt > blockDim.x)
        {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n", numCnt, (int)blockDim.x);}

        if (dataBlockPos >= output.size())
        {return;}

        int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;

        indexT id = indexBuffer.template get<0>(dataBlockPos);

        // reconstruct the global coordinate of the point
        for (int i = 0 ; i < dim ; i++)

        if (predicate == true)
        {
            for (int k = 0 ; k < boxes.size() ; k++)
            {
                // count the point for box k if it falls inside it
                    atomicAdd(&ato_cnt[k], 1);
            }
        }

        __syncthreads();

        if (threadIdx.x < boxes.size())
        {
            output.template get<0>(dataBlockPos + threadIdx.x*stride_size) = ato_cnt[threadIdx.x];
            output.template get<1>(dataBlockPos + threadIdx.x*stride_size) = (ato_cnt[threadIdx.x] != 0);
        }
    }
    // Emit, per box, the linearized ids of the existing points falling inside that box, together
    // with their position inside the packing buffer.
    template<unsigned int dim,
             unsigned int pMask,
             unsigned int numCnt,
             typename indexT,
             typename dataBufferType,
             typename packBufferType,
             typename scanType,
             typename scanItType,
             typename outputType,
             typename boxesVector_type,
             typename grid_smb_type,
             typename indexBuffer_type>
    __global__ void get_exist_points_with_boxes(indexBuffer_type indexBuffer,
                                                boxesVector_type boxes,
                                                dataBufferType dataBuf,
                                                packBufferType pack_output,
                                                scanType scan,
                                                scanItType scan_it,
                                                outputType output)
    {
        typedef typename dataBufferType::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
        constexpr unsigned int blockSize = MaskBlockT::size;

        const unsigned int dataBlockPos = blockIdx.x;
        const unsigned int offset = threadIdx.x % blockSize;

        __shared__ int ato_cnt[numCnt];

        if (threadIdx.x < numCnt)
        {ato_cnt[threadIdx.x] = 0;}

        __syncthreads();

        if (numCnt > blockDim.x)
        {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n", numCnt, (int)blockDim.x);}

        int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;

        indexT id = indexBuffer.template get<0>(dataBlockPos);

        // copy the global coordinate of the point into p_
        for (int i = 0 ; i < dim ; i++)
        {p_.get(i) = pnt.get(i);}

        if (predicate == true)
        {
            for (int k = 0 ; k < boxes.size() ; k++)
            {
                // if the point falls inside boxes.get(k), reserve a slot and emit it
                    int p = atomicAdd(&ato_cnt[k], 1);

                    const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuffer.size() + 1));
                    unsigned int sit = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1));
                    int scan_id = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1)) + scan_it.template get<0>(k);
                    output.template get<0>(scan_id + p) = (offset + dataBlockPos * blockSize) * numCnt + k;
                    pack_output.template get<0>(scan_id + p) = p + sit;
            }
        }
    }
    // Convert packed chunk ids into destination-grid chunk ids, applying the given shifts
    // (one output entry per (chunk, shift) pair).
    template<unsigned int dim,
             unsigned int blockSize,
             unsigned int blockEdgeSize,
             unsigned int nshifts,
             typename indexT,
             typename linearizer,
             typename shiftTypeVector,
             typename outputType>
    __global__ void convert_chunk_ids(indexT * ids,
                                      linearizer gridGeoPack,
                                      linearizer gridGeo,
                                      outputType output,
                                      shiftTypeVector shifts,
                                      Point<dim,int> sz,
                                      int bs)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        for (int i = 0 ; i < nshifts ; i++)
        {
            // shift the chunk origin pos and wrap it back inside the domain
            for (int j = 0 ; j < dim ; j++)
            {
                pos.set_d(j, pos.get(j) + shifts.template get<0>(i)[j]*blockEdgeSize);

                if (pos.get(j) >= sz.get(j))
                {
                    pos.set_d(j, pos.get(j) - blockEdgeSize);
                }
            }

            auto plin = gridGeo.LinId(pos);

            output.template get<0>(p*nshifts + i + bs) = plin / blockSize;
        }
    }
    // Set the first property of every element to 1.
    template<typename vectorType>
    __global__ void set_one(vectorType vt)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= vt.size()) {return;}

        vt.template get<0>(p) = 1;
    }
    // Build the map from the data-map entries of every segment to the merged output chunk.
    template<unsigned int pSegment,
             typename newMapType,
             typename mergeMapType,
             typename dataMapType,
             typename segmentOffsetType,
             typename outMapType>
    __global__ void construct_new_chunk_map(newMapType new_map, dataMapType dataMap,
                                            mergeMapType merge_id, outMapType outMap,
                                            segmentOffsetType segments_data, int start_p)
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= segments_data.size() - 1)
        {return;}

        unsigned int st = segments_data.template get<pSegment>(p);

        int segmentSize = segments_data.template get<pSegment>(p + 1)
                          - segments_data.template get<pSegment>(p);

        for (int j = 0 ; j < segmentSize ; j++)
        {
            int dm = dataMap.template get<0>(st + j);
            new_map.template get<0>(dm) = outMap.template get<0>(p);
        }
    }
    // Copy the unpacked point data into the destination chunks: one CUDA block per packed chunk,
    // one thread per packed point of that chunk.
    template<unsigned int pMask,
             typename AggregateT,
             typename blockConvertType,
             typename newMapType,
             typename dataType_ptrs,
             typename dataType,
             unsigned int ... prp>
    __global__ void copy_packed_data_to_chunks(unsigned int * scan,
                                               unsigned short int * offsets,
                                               blockConvertType blc,
                                               newMapType new_map,
                                               dataType_ptrs data_ptrs,
                                               dataType data_buff,
                                               unsigned int n_shf,
                                               unsigned int n_pnt,
                                               unsigned int i,
                                               unsigned int n_accu_cnk)
    {
        const unsigned int p = blockIdx.x;

        int scan_pp = scan[p];
        int n_block_pnt = scan[p+1] - scan_pp;

        if (threadIdx.x < n_block_pnt)
        {
            unsigned short int off = offsets[scan[p] + threadIdx.x];

            int conv = blc.template get<0>(i)[off];

            // low 16 bits: offset inside the destination chunk, high 16 bits: shift index
            unsigned short int off_c = conv & 0xFFFF;
            unsigned short int shf_c = conv >> 16;

            unsigned int pos_c = new_map.template get<0>(n_shf*p + shf_c + n_accu_cnk);

            // spi copies every packed property prp... of this point into the destination chunk
                spi(pos_c, off_c, data_buff, scan_pp + threadIdx.x, data_ptrs, n_pnt);

            boost::mpl::for_each_ref< boost::mpl::range_c<int, 0, sizeof...(prp)> >(spi);

            data_buff.template get<pMask>(pos_c)[off_c] |= 0x1;
        }
    }
    // Write the last value of each per-pack scan into the corresponding packing header.
    template<typename scanPointerType, typename scanType>
    __global__ void last_scan_point(scanPointerType scan_ptr, scanType scan, unsigned int stride, unsigned int n_pack)
    {
        const unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;

        if (k >= n_pack) {return;}

        unsigned int ppos = scan.template get<0>((k+1)*stride - 1);
        unsigned int pos = scan.template get<1>((k+1)*stride - 1);

        ((unsigned int *)scan_ptr.ptr[k])[pos] = ppos;
    }
    // Pack the selected points: for every (point, pack) pair copy the properties, the chunk
    // index, the in-chunk offset and the mask into the destination buffers of pack k.
    template<unsigned int pMask,
             typename AggregateT,
             typename indexT,
             typename pntBuff_type,
             typename pointOffset_type,
             typename indexBuffer_type,
             typename dataBuffer_type,
             typename scanType,
             unsigned int blockSize,
             unsigned int ... prp>
    __global__ void pack_data(pntBuff_type pntBuff,
                              dataBuffer_type dataBuff,
                              indexBuffer_type indexBuff,
                              scanType scan,
                              pointOffset_type point_offsets,
    {
        const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= pntBuff.size())
        {return;}

        const unsigned int pb = pntBuff.template get<0>(p);
        const unsigned int p_offset = point_offsets.template get<0>(p);

        // pb encodes both the point id and the pack index k
        const unsigned int k = pb % n_it;
        const unsigned int id = pb / n_it;

        const unsigned int dataBlockPos = id / blockSize;
        const unsigned int offset = id % blockSize;

        unsigned int ppos = scan.template get<0>(dataBlockPos + k*(indexBuff.size() + 1));
        const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuff.size() + 1));

        // spi copies every selected property prp... of the point into the pack buffers
            spi(dataBlockPos, offset, dataBuff, p_offset, data_ptr->ptr[k], sar.sa[k]);

        boost::mpl::for_each_ref< boost::mpl::range_c<int, 0, sizeof...(prp)> >(spi);

        ((unsigned int *)scan_ptr.ptr[k])[dataBlockPosPack] = ppos;

        ((indexT *)index_ptr.ptr[k])[dataBlockPosPack] = indexBuff.template get<0>(dataBlockPos);
        ((short int *)offset_ptr.ptr[k])[p_offset] = offset;
        ((unsigned char *)mask_ptr.ptr[k])[p_offset] = dataBuff.template get<pMask>(dataBlockPos)[offset];
    }
} // namespace SparseGridGpuKernels

#endif //OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH