OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
SparseGridGpu_kernels.cuh
1 //
2 // Created by tommaso on 19/06/19.
3 //
4 
5 #ifndef OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
6 #define OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
7 
8 #include <SparseGridGpu/BlockMapGpu.hpp>
9 #include <SparseGridGpu/TemplateUtils/mathUtils.hpp>
10 #include "util/cuda_util.hpp"
11 #include "SparseGrid/cp_block.hpp"
12 
13 #ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
14 #define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
15 #endif
16 
17 #ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
18 #define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
19 #endif
20 
21 enum mask_sparse
22 {
23  NOT_EXIST = 0,
24  EXIST = 1,
25  PADDING = 2,
26  EXIST_AND_PADDING = 3
27 };
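/*! \brief Illustrative sketch (added, not part of the original header): how the
 *         mask_sparse flags are meant to be combined and tested. The kernels below
 *         read one mask byte per point and test the bits directly; the helper name
 *         here is hypothetical.
 *
 * \code
 * __device__ inline bool is_active_point(unsigned char mask)
 * {
 *     // a point takes part in a stencil only if it exists and is not a padding point
 *     return (mask & mask_sparse::EXIST) && !(mask & mask_sparse::PADDING);
 * }
 * \endcode
 */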
28 
29 // Kernels for SparseGridGpu
30 namespace SparseGridGpuKernels
31 {
32  template<typename SparseGridGpuType, typename pointers_type, typename headers_type>
33  __global__ void unpack_headers(pointers_type pointers, headers_type headers, int * result, unsigned int sz_pack, int n_slot)
34  {
35  int t = threadIdx.x;
36 
37  if (t >= pointers.size()) {return;}
38 
39  unsigned char * data_pack = (unsigned char *)pointers.template get<0>(t);
40 
41  while (data_pack < pointers.template get<1>(t) )
42  {
43  int ih = pointers.template get<2>(t);
44  if (n_slot > ih)
45  {
46  if (sizeof(typename SparseGridGpuType::indexT_) == 8)
47  {headers.template get<0>(t*n_slot + ih) = *(size_t *)data_pack;}
48  else
49  {
50  unsigned int dp1 = *(unsigned int *)data_pack;
51  unsigned int dp2 = *(unsigned int *)&(data_pack[4]);
52  headers.template get<0>(t*n_slot + ih) = (size_t)dp1 + (((size_t)dp2) << 32);
53  }
54  data_pack += sizeof(size_t);
55  data_pack += SparseGridGpuType::unpack_headers(headers,data_pack,t*n_slot + ih,sz_pack);
56  pointers.template get<2>(t) += 1;
57  }
58  else
59  {
60  // report error
61  result[0] = 1;
62  return;
63  }
64  }
65  }
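/*! \brief Illustrative sketch (added): the 32-bit pair trick used above. When the index
 *         type is not 8 bytes wide the packed stream may only be 4-byte aligned, so the
 *         64-bit header value is assembled from two 32-bit loads instead of a single
 *         size_t load. The helper name is hypothetical.
 *
 * \code
 * __device__ inline size_t read_u64_from_u32_pair(const unsigned char * ptr)
 * {
 *     unsigned int lo = *(const unsigned int *)ptr;        // least significant word
 *     unsigned int hi = *(const unsigned int *)(ptr + 4);  // most significant word
 *     return (size_t)lo + (((size_t)hi) << 32);
 * }
 * \endcode
 */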
66 
67 
68  template<unsigned int dim>
69  struct stencil_cross_func_impl
70  {
71  template<typename ScalarT, typename coordType, typename SparseGridT, unsigned int enlargedBlockSize, typename lambda_func, typename ... ArgsT>
72  __device__ static inline void stencil(ScalarT & res, ScalarT & cur, coordType & coord ,
73  ScalarT (& enlargedBlock)[enlargedBlockSize],
74  lambda_func f,
75  SparseGridT & sparseGrid, ArgsT ... args)
76  {
77  cross_stencil<dim,ScalarT> cs;
78 
79  for (int d = 0; d < dim; ++d)
80  {
81  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
82  auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
83  ScalarT neighbourPlus = enlargedBlock[nPlusId];
84  ScalarT neighbourMinus = enlargedBlock[nMinusId];
85 
86  cs.xm[d] = neighbourMinus;
87  cs.xp[d] = neighbourPlus;
88  }
89 
90  res = f(cur,cs, args ...);
91  }
92  };
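/*! \brief Illustrative sketch (added): the kind of lambda stencil_cross_func_impl::stencil
 *         forwards to. The lambda receives the centre value and the cross_stencil filled
 *         above (cs.xm[d] / cs.xp[d] hold the -1/+1 neighbours along each axis); extra
 *         arguments are forwarded unchanged. The dt parameter is a made-up example.
 *
 * \code
 * // 3D example: one explicit Euler step of the heat equation on the cross stencil
 * auto heat_step = [] __device__ (float cur, cross_stencil<3,float> & cs, float dt)
 * {
 *     float lap = 0.0f;
 *     for (int d = 0 ; d < 3 ; d++)
 *     {lap += cs.xp[d] + cs.xm[d] - 2.0f*cur;}
 *     return cur + dt * lap;
 * };
 * \endcode
 */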
93 
94  template<unsigned int dim>
95  struct stencil_conv_func_impl
96  {
97  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
98  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
99  CpBlockType & cpb,
100  lambda_func f,
101  ArgsT ... args)
102  {
103  printf("Convolution operation on GPU: Dimension not implemented \n");
104  }
105 
106  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
107  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
108  CpBlockType & cpb1,
109  CpBlockType & cpb2,
110  lambda_func f,
111  ArgsT ... args)
112  {
113  printf("Convolution operation on GPU: Dimension not implemented \n");
114  }
115  };
116 
117  template<>
118  struct stencil_conv_func_impl<3>
119  {
120  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
121  __device__ static inline void stencil_block(ScalarT & res, coordType & coord ,
122  CpBlockType & cpb,
123  DataBlockWrapperT & DataBlockLoad,
124  int offset,
125  lambda_func f,
126  ArgsT ... args)
127  {
128  res = f(cpb,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
129  }
130 
131  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
132  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
133  CpBlockType & cpb,
134  lambda_func f,
135  ArgsT ... args)
136  {
137  res = f(cpb,coord[0],coord[1],coord[2]);
138  }
139 
140  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
141  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
142  CpBlockType & cpb1,
143  CpBlockType & cpb2,
144  lambda_func f,
145  ArgsT ... args)
146  {
147  f(res1,res2,cpb1,cpb2,coord[0],coord[1],coord[2]);
148  }
149 
150  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
151  __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
152  CpBlockType & cpb1,
153  CpBlockType & cpb2,
154  DataBlockWrapperT & DataBlockLoad,
155  int offset,
156  lambda_func f,
157  ArgsT ... args)
158  {
159  f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
160  }
161  };
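/*! \brief Illustrative sketch (added): the kind of lambda the 3D specialisation above
 *         forwards to, called as f(cpb, x, y, z). Reading neighbours through an
 *         operator()(x,y,z) accessor on the cp_block is an assumption of this sketch.
 *
 * \code
 * // 3D discrete Laplacian expressed as a convolution lambda
 * auto laplacian_3d = [] __device__ (auto & u, int i, int j, int k)
 * {
 *     return u(i+1,j,k) + u(i-1,j,k) +
 *            u(i,j+1,k) + u(i,j-1,k) +
 *            u(i,j,k+1) + u(i,j,k-1) - 6.0f*u(i,j,k);
 * };
 * \endcode
 */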
162 
163  template<>
164  struct stencil_conv_func_impl<2>
165  {
166  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
167  __device__ static inline void stencil_block(ScalarT & res, coordType & coord,
168  CpBlockType & cpb,
169  DataBlockWrapperT & DataBlockLoad,
170  int offset,
171  lambda_func f,
172  ArgsT ... args)
173  {
174  res = f(cpb,DataBlockLoad,offset,coord[0],coord[1]);
175  }
176 
177  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
178  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
179  CpBlockType & cpb,
180  lambda_func f,
181  ArgsT ... args)
182  {
183  res = f(cpb,coord[0],coord[1]);
184  }
185 
186  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
187  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
188  CpBlockType & cpb1,
189  CpBlockType & cpb2,
190  lambda_func f,
191  ArgsT ... args)
192  {
193  f(res1,res2,cpb1,cpb2,coord[0],coord[1]);
194  }
195 
196  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
197  __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
198  CpBlockType & cpb1,
199  CpBlockType & cpb2,
200  DataBlockWrapperT & DataBlockLoad,
201  int offset,
202  lambda_func f,
203  ArgsT ... args)
204  {
205  f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
206  }
207  };
208 
209  template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
210  struct stencil_cross_func_conv
211  {
212  typedef NNStar<dim> stencil_type;
213 
214  static constexpr unsigned int supportRadius = stencil_size;
215 
216  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
217  static inline __device__ void stencil(
218  SparseGridT & sparseGrid,
219  const unsigned int dataBlockId,
220  openfpm::sparse_index<unsigned int> dataBlockIdPos,
221  unsigned int offset,
222  grid_key_dx<dim, int> & pointCoord,
223  DataBlockWrapperT & dataBlockLoad,
224  DataBlockWrapperT & dataBlockStore,
225  unsigned char curMask,
226  lambda_func f,
227  ArgT ... args)
228  {
229  typedef typename SparseGridT::AggregateBlockType AggregateT;
230  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
231 
232  constexpr unsigned int enlargedBlockSize = IntPow<
233  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
234 
235  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
236 
237  for (int i = 0; i < n_loop ; i++)
238  {
239  if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
240  {
241  enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
242  }
243  }
244 
245  __syncthreads();
246 
247  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
248  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
249 
250  cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock);
251 
252  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
253 
254  __syncthreads();
255 
256  ScalarT res = 0;
257 
258  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
259  {
260  int coord[dim];
261 
262  unsigned int linIdTmp = offset;
263  for (unsigned int d = 0; d < dim; ++d)
264  {
265  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
266  linIdTmp /= SparseGridT::blockEdgeSize_;
267  }
268 
269  stencil_conv_func_impl<dim>::stencil(res,coord,cpb,f,args...);
270 
271  dataBlockStore.template get<p_dst>()[offset] = res;
272  }
273  }
274 
275  template <typename SparseGridT, typename CtxT>
276  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
277  {
278  // No flush
279  }
280  };
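/*! \brief Illustrative note (added): the offset -> local coordinate decoding used in the
 *         functors above. The thread offset is the linear index of the point inside its
 *         chunk and is decoded digit by digit in base blockEdgeSize. The helper below is
 *         a standalone restatement of that loop.
 *
 * \code
 * // Example with dim = 3 and blockEdgeSize = 4: offset = 37 = 1 + 1*4 + 2*16
 * // decodes to coord = {1, 1, 2}
 * inline void decode_offset(unsigned int offset, unsigned int blockEdgeSize, int dim, int * coord)
 * {
 *     for (int d = 0; d < dim; ++d)
 *     {
 *         coord[d] = offset % blockEdgeSize;
 *         offset /= blockEdgeSize;
 *     }
 * }
 * \endcode
 */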
281 
282 
283  template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
284  struct stencil_cross_func_conv_block_read
285  {
286  typedef NNStar<dim> stencil_type;
287 
288  static constexpr unsigned int supportRadius = stencil_size;
289 
290  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
291  static inline __device__ void stencil(
292  SparseGridT & sparseGrid,
293  const unsigned int dataBlockId,
294  openfpm::sparse_index<unsigned int> dataBlockIdPos,
295  unsigned int offset,
296  grid_key_dx<dim, int> & pointCoord,
297  DataBlockWrapperT & dataBlockLoad,
298  DataBlockWrapperT & dataBlockStore,
299  unsigned char curMask,
300  lambda_func f,
301  ArgT ... args)
302  {
303  typedef typename SparseGridT::AggregateBlockType AggregateT;
304  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
305 
306  constexpr unsigned int enlargedBlockSize = IntPow<
307  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
308 
309  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
310 
311  for (int i = 0; i < n_loop ; i++)
312  {
313  if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
314  {
315  enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
316  }
317  }
318 
319  __syncthreads();
320 
321  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
322  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
323 
324  cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock);
325 
326  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
327 
328  __syncthreads();
329 
330  ScalarT res = 0;
331 
332  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
333  {
334  int coord[dim];
335 
336  unsigned int linIdTmp = offset;
337  for (unsigned int d = 0; d < dim; ++d)
338  {
339  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
340  linIdTmp /= SparseGridT::blockEdgeSize_;
341  }
342 
343  stencil_conv_func_impl<dim>::stencil_block(res,coord,cpb,dataBlockLoad,offset,f,args...);
344 
345  dataBlockStore.template get<p_dst>()[offset] = res;
346  }
347  }
348 
349  template <typename SparseGridT, typename CtxT>
350  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
351  {
352  // No flush
353  }
354  };
355 
356  template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
357  struct stencil_func_conv2_b
358  {
359  typedef NNStar<dim> stencil_type;
360 
361  static constexpr unsigned int supportRadius = stencil_size;
362 
363  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
364  static inline __device__ void stencil(
365  SparseGridT & sparseGrid,
366  const unsigned int dataBlockId,
367  openfpm::sparse_index<unsigned int> dataBlockIdPos,
368  unsigned int offset,
369  grid_key_dx<dim, int> & pointCoord,
370  DataBlockWrapperT & dataBlockLoad,
371  DataBlockWrapperT & dataBlockStore,
372  unsigned char curMask,
373  lambda_func f,
374  ArgT ... args)
375  {
376  typedef typename SparseGridT::AggregateBlockType AggregateT;
377  typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
378  typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
379 
380  constexpr unsigned int enlargedBlockSize = IntPow<
381  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
382 
383  __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
384  __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
385 
386  // fill with background
387 
388  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
389  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
390 
391  cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
392  cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
393 
394  sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
395  sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
396 
397  __syncthreads();
398 
399  ScalarT1 res1 = 0;
400  ScalarT2 res2 = 0;
401 
402  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
403  {
404  int coord[dim];
405 
406  unsigned int linIdTmp = offset;
407  for (unsigned int d = 0; d < dim; ++d)
408  {
409  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
410  linIdTmp /= SparseGridT::blockEdgeSize_;
411  }
412 
413  stencil_conv_func_impl<dim>::stencil2_block(res1,res2,coord,cpb1,cpb2,dataBlockLoad,offset,f,args...);
414 
415  dataBlockStore.template get<p_dst1>()[offset] = res1;
416  dataBlockStore.template get<p_dst2>()[offset] = res2;
417  }
418  }
419 
420  template <typename SparseGridT, typename CtxT>
421  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
422  {
423  // No flush
424  }
425  };
426 
427  template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
428  struct stencil_func_conv2
429  {
430  typedef NNStar<dim> stencil_type;
431 
432  static constexpr unsigned int supportRadius = stencil_size;
433 
434  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
435  static inline __device__ void stencil(
436  SparseGridT & sparseGrid,
437  const unsigned int dataBlockId,
438  openfpm::sparse_index<unsigned int> dataBlockIdPos,
439  unsigned int offset,
440  grid_key_dx<dim, int> & pointCoord,
441  DataBlockWrapperT & dataBlockLoad,
442  DataBlockWrapperT & dataBlockStore,
443  unsigned char curMask,
444  lambda_func f,
445  ArgT ... args)
446  {
447  typedef typename SparseGridT::AggregateBlockType AggregateT;
448  typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
449  typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
450 
451  constexpr unsigned int enlargedBlockSize = IntPow<
452  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
453 
454  __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
455  __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
456 
457  // fill with background
458 
459  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
460  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
461 
462  cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
463  cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
464 
465  sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
466  sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
467 
468  __syncthreads();
469 
470  ScalarT1 res1 = 0;
471  ScalarT2 res2 = 0;
472 
473  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
474  {
475  int coord[dim];
476 
477  unsigned int linIdTmp = offset;
478  for (unsigned int d = 0; d < dim; ++d)
479  {
480  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
481  linIdTmp /= SparseGridT::blockEdgeSize_;
482  }
483 
484  stencil_conv_func_impl<dim>::stencil2(res1,res2,coord,cpb1,cpb2,f,args...);
485 
486  dataBlockStore.template get<p_dst1>()[offset] = res1;
487  dataBlockStore.template get<p_dst2>()[offset] = res2;
488  }
489  }
490 
491  template <typename SparseGridT, typename CtxT>
492  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
493  {
494  // No flush
495  }
496  };
497 
498  template<unsigned int dim, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
499  struct stencil_cross_func
500  {
501  typedef NNStar<dim> stencil_type;
502 
503  static constexpr unsigned int supportRadius = stencil_size;
504 
505  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
506  static inline __device__ void stencil(
507  SparseGridT & sparseGrid,
508  const unsigned int dataBlockId,
509  openfpm::sparse_index<unsigned int> dataBlockIdPos,
510  unsigned int offset,
511  grid_key_dx<dim, int> & pointCoord,
512  DataBlockWrapperT & dataBlockLoad,
513  DataBlockWrapperT & dataBlockStore,
514  unsigned char curMask,
515  lambda_func f,
516  ArgT ... args)
517  {
518  typedef typename SparseGridT::AggregateBlockType AggregateT;
519  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
520 
521  constexpr unsigned int enlargedBlockSize = IntPow<
522  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
523 
524  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
525 
526  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
527 
528  __syncthreads();
529 
530  decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
531  ScalarT res = 0;
532 
533  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
534  {
535  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
536  // const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
537  linId = sparseGrid.getLinIdInEnlargedBlock(offset);
538  ScalarT cur = enlargedBlock[linId];
539 
540  stencil_cross_func_impl<dim>::stencil(res,cur,coord,enlargedBlock,f,sparseGrid,args ...);
541 
542 
543  }
544  __syncthreads();
545  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
546  {
547  enlargedBlock[linId] = res;
548  }
549  __syncthreads();
550  sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
551  }
552 
553  template <typename SparseGridT, typename CtxT>
554  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
555  {
556  // No flush
557  }
558  };
559 
560  // This kernel is to be called with 1D parameters (?)
561  template <unsigned int dim,
562  unsigned int stencilSupportRadius,
563  unsigned int pMask,
564  typename NN_type,
565  typename checker_type,
566  typename IndexBufT,
567  typename DataBufT,
568  typename SparseGridT,
569  typename nn_blocksT>
570  __global__ void tagBoundaries(IndexBufT indexBuffer, DataBufT dataBuffer, SparseGridT sparseGrid,nn_blocksT nbT, checker_type chk)
571  {
572  //todo: #ifdef __NVCC__
573  constexpr unsigned int pIndex = 0;
574 
575  typedef typename IndexBufT::value_type IndexAggregateT;
576  typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;
577 
578  typedef typename DataBufT::value_type AggregateT;
579  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
580  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
581  constexpr unsigned int blockSize = MaskBlockT::size;
582 
583  // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
584  // since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
585  const unsigned int dataBlockPos = blockIdx.x;
586  const unsigned int offset = threadIdx.x;
587 
588  constexpr unsigned int enlargedBlockSize = IntPow<
589  sparseGrid.getBlockEdgeSize() + 2 * stencilSupportRadius, dim>::value;
590  __shared__ MaskT enlargedBlock[enlargedBlockSize];
591 
592  if (dataBlockPos >= indexBuffer.size())
593  {
594  return;
595  }
596 
597  const long long dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
598  auto dataBlock = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
599 
600  openfpm::sparse_index<unsigned int> sdataBlockPos;
601  sdataBlockPos.id = dataBlockPos;
602  sparseGrid.template loadGhostBlock<pMask>(dataBlock,sdataBlockPos,enlargedBlock);
603 
604  __syncthreads();
605 
606  bool check = chk.check(sparseGrid,dataBlockId,offset);
607 
608  //Here code for tagging the boundary
609  if (offset < blockSize && check == true)
610  {
611  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
612  const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
613 
614  MaskT cur = enlargedBlock[linId];
615  if (sparseGrid.exist(cur))
616  {
617  bool isPadding = NN_type::isPadding(sparseGrid,coord,enlargedBlock);
618  if (isPadding)
619  {
620  sparseGrid.setPadding(enlargedBlock[linId]);
621  }
622  else
623  {
624  sparseGrid.unsetPadding(enlargedBlock[linId]);
625  }
626  }
627  }
628  // Write block back to global memory
629  __syncthreads();
630  sparseGrid.template storeBlock<pMask>(dataBlock, enlargedBlock);
631  }
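/*! \brief Illustrative sketch (added): the spirit of the padding test used by
 *         tagBoundaries for a star (NNStar) neighbourhood. A point that exists is
 *         tagged as padding when at least one of its 2*dim axis neighbours does not
 *         exist, i.e. it lies on the surface of the existing region. This is a
 *         conceptual sketch, not the NN_type::isPadding implementation.
 *
 * \code
 * template<unsigned int dim, typename SparseGridT, typename CoordT, typename MaskT, unsigned int N>
 * __device__ bool is_padding_star(SparseGridT & sparseGrid, CoordT & coord,
 *                                 const MaskT (& enlargedBlock)[N])
 * {
 *     for (int d = 0; d < dim; ++d)
 *     {
 *         auto plus  = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d,  1);
 *         auto minus = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
 *         if (!sparseGrid.exist(enlargedBlock[plus]) || !sparseGrid.exist(enlargedBlock[minus]))
 *         {return true;}
 *     }
 *     return false;
 * }
 * \endcode
 */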
632 
637  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType>
638  __global__ void link_construct(SparseGridType grid_up, SparseGridType grid_cu, outputType out)
639  {
640  const unsigned int dataBlockPos = blockIdx.x;
641  const unsigned int offset = threadIdx.x;
642 
643  auto & indexBuffer = grid_cu.getIndexBuffer();
644  auto & dataBuffer = grid_cu.getDataBuffer();
645 
646  // if the point is a padding
647  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
648  {
649  auto id = indexBuffer.template get<0>(dataBlockPos);
650  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
651 
652  printf("HERE %d %d \n",pos.get(0),pos.get(1));
653 
654  for (int i = 0 ; i < dim ; i++)
655  {pos.set_d(i,pos.get(i) / 2);}
656 
657  if (grid_up.template get<pMask>(pos) == 0x1)
658  {
659  atomicAdd(&out.template get<0>(dataBlockPos),1);
660  }
661  }
662  }
663 
668  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType, typename BoxType>
669  __global__ void count_paddings(SparseGridType grid_cu, outputType out, BoxType box)
670  {
671  const unsigned int dataBlockPos = blockIdx.x;
672  const unsigned int offset = threadIdx.x;
673 
674  auto & indexBuffer = grid_cu.getIndexBuffer();
675  auto & dataBuffer = grid_cu.getDataBuffer();
676 
677  auto id = indexBuffer.template get<0>(dataBlockPos);
678 
679  // check if the point is inside the box
680  auto coord = grid_cu.getCoord(id,offset);
681 
682  bool active = box.isInsideKey(coord);
683 
684  if (active == false)
685  {return;}
686 
687  // if the point is a padding
688  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
689  {
690  atomicAdd(&out.template get<0>(dataBlockPos),1);
691  }
692  }
693 
698  template<unsigned int pMask, typename SparseGridType, typename ScanType, typename outputType, typename BoxType>
699  __global__ void collect_paddings(SparseGridType grid_cu, ScanType stp, outputType out, BoxType box)
700  {
701  const unsigned int dataBlockPos = blockIdx.x;
702  const unsigned int offset = threadIdx.x;
703 
704  __shared__ int counter;
705  counter = 0;
706  __syncthreads();
707 
708  auto & indexBuffer = grid_cu.getIndexBuffer();
709  auto & dataBuffer = grid_cu.getDataBuffer();
710 
711  auto id = indexBuffer.template get<0>(dataBlockPos);
712 
713  // check if the point is inside the box
714  auto coord = grid_cu.getCoord(id,offset);
715 
716  bool active = box.isInsideKey(coord);
717 
718  if (active == false)
719  {return;}
720 
721  int pad_offset = stp.template get<0>(dataBlockPos);
722 
723  // if the point is a padding
724  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
725  {
726  int cnt = atomicAdd(&counter,1);
727 
728  out.template get<0>(pad_offset + cnt) = dataBlockPos;
729  out.template get<1>(pad_offset + cnt) = offset;
730  }
731  }
732 
737  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
738  typename padPointType , typename SparseGridType,
739  typename outputType>
740  __global__ void link_construct_dw_count(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, outputType out, Point<dim,int> p_dw)
741  {
742  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
743 
744  if (p >= padPoints.size()) {return;}
745 
746  const unsigned int dataBlockPos = padPoints.template get<0>(p);
747  const unsigned int offset = padPoints.template get<1>(p);
748 
749  auto & indexBuffer = grid_cu.getIndexBuffer();
750  auto & dataBuffer = grid_cu.getDataBuffer();
751 
752  auto id = indexBuffer.template get<0>(dataBlockPos);
753  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
754 
755  for (int i = 0 ; i < dim ; i++)
756  {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
757 
758  for (int j = 0 ; j < 2*dim ; j++)
759  {
760  grid_key_dx<dim,int> kc;
761  for (int k = 0 ; k < dim ; k++)
762  {
763  kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
764  }
765 
766  if (grid_dw.template get<pMask>(kc) & 0x1)
767  {
768  int a = atomicAdd(&out.template get<0>(p),1);
769  }
770  }
771  }
772 
777  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
778  typename padPointType , typename SparseGridType,
779  typename outputType>
780  __global__ void link_construct_up_count(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, outputType out, Point<dim,int> p_up)
781  {
782  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
783 
784  if (p >= padPoints.size()) {return;}
785 
786  const unsigned int dataBlockPos = padPoints.template get<0>(p);
787  const unsigned int offset = padPoints.template get<1>(p);
788 
789  auto & indexBuffer = grid_cu.getIndexBuffer();
790  auto & dataBuffer = grid_cu.getDataBuffer();
791 
792  auto id = indexBuffer.template get<0>(dataBlockPos);
793  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
794 
795  for (int i = 0 ; i < dim ; i++)
796  {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
797 
798  if (grid_up.template get<pMask>(pos) & 0x1)
799  {
800  int a = atomicAdd(&out.template get<0>(p),1);
801  }
802  }
803 
808  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
809  typename padPointType , typename SparseGridType, typename scanType, typename outputType>
810  __global__ void link_construct_insert_dw(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_dw)
811  {
812  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
813 
814  if (p >= padPoints.size()) {return;}
815 
816  const unsigned int dataBlockPos = padPoints.template get<0>(p);
817  const unsigned int offset = padPoints.template get<1>(p);
818 
819  auto & indexBuffer = grid_cu.getIndexBuffer();
820  auto & dataBuffer = grid_cu.getDataBuffer();
821 
822  auto & dataBuffer_dw = grid_dw.getDataBuffer();
823 
824  auto id = indexBuffer.template get<0>(dataBlockPos);
825  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
826 
827  for (int i = 0 ; i < dim ; i++)
828  {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
829 
830  unsigned int dataBlockPos_dw;
831  unsigned int offset_dw;
832 
833  int link_offset = scan.template get<0>(p);
834 
835  int c = 0;
836  for (int j = 0 ; j < 2*dim ; j++)
837  {
838  grid_key_dx<dim,int> kc;
839  for (int k = 0 ; k < dim ; k++)
840  {
841  kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
842  }
843 
844  grid_dw.get_sparse(kc,dataBlockPos_dw,offset_dw);
845 
846  if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
847  {
848  out.template get<0>(link_offset + c) = dataBlockPos_dw;
849  out.template get<1>(link_offset + c) = offset_dw;
850 
851  c++;
852  }
853  }
854  }
855 
860  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
861  typename padPointType , typename SparseGridType, typename scanType, typename outputType>
862  __global__ void link_construct_insert_up(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_up)
863  {
864  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
865 
866  if (p >= padPoints.size()) {return;}
867 
868  const unsigned int dataBlockPos = padPoints.template get<0>(p);
869  const unsigned int offset = padPoints.template get<1>(p);
870 
871  auto & indexBuffer = grid_cu.getIndexBuffer();
872  auto & dataBuffer = grid_cu.getDataBuffer();
873 
874  auto & dataBuffer_dw = grid_up.getDataBuffer();
875 
876  auto id = indexBuffer.template get<0>(dataBlockPos);
877  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
878 
879  for (int i = 0 ; i < dim ; i++)
880  {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
881 
882  unsigned int dataBlockPos_dw;
883  unsigned int offset_dw;
884 
885  int link_offset = scan.template get<0>(p);
886 
887  grid_up.get_sparse(pos,dataBlockPos_dw,offset_dw);
888 
889  if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
890  {
891  out.template get<0>(link_offset) = dataBlockPos_dw;
892  out.template get<1>(link_offset) = offset_dw;
893  }
894  }
895 
903  template <unsigned int dim,
904  typename nNN_type,
905  typename IndexBufT,
906  typename SparseGridT,
907  typename nn_blocksT>
908  __global__ void findNeighbours(IndexBufT indexBuffer, SparseGridT sparseGrid, nn_blocksT nn_blocks)
909  {
910  //todo: #ifdef __NVCC__
911  constexpr unsigned int pIndex = 0;
912 
913  typedef typename IndexBufT::value_type IndexAggregateT;
914  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
915 
916  const unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
917 
918  const unsigned int dataBlockPos = pos / nNN_type::nNN;
919  const unsigned int offset = pos % nNN_type::nNN;
920 
921  if (dataBlockPos >= indexBuffer.size())
922  {return;}
923 
924  const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
925 
926  auto neighbourPos = sparseGrid.template getNeighboursPos<nNN_type>(dataBlockId, offset);
927 
928  nn_blocks.template get<0>(dataBlockPos*nNN_type::nNN + offset) = neighbourPos;
929  }
930 
931  template <unsigned int dim,
932  unsigned int pMask,
933  typename stencil,
934  typename IndexBufT,
935  typename DataBufT,
936  typename SparseGridT,
937  typename... Args>
938  __global__ void
939  applyStencilInPlace(
940  Box<dim,int> bx,
941  IndexBufT indexBuffer,
942  DataBufT dataBuffer,
943  SparseGridT sparseGrid,
944  Args... args)
945  {
946  constexpr unsigned int pIndex = 0;
947 
948  typedef typename IndexBufT::value_type IndexAggregateT;
949  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
950 
951  typedef typename DataBufT::value_type AggregateT;
952  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
953  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
954  constexpr unsigned int blockSize = MaskBlockT::size;
955 
956  // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
957  // since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
958  const unsigned int dataBlockPos = blockIdx.x;
959  const unsigned int offset = threadIdx.x;
960 
961  if (dataBlockPos >= indexBuffer.size())
962  {
963  return;
964  }
965 
966  auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
967 
968  // todo: Add management of RED-BLACK stencil application! :)
969  const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
970  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
971 
972  unsigned char curMask;
973 
974  if (offset < blockSize)
975  {
976  // Read local mask to register
977  curMask = dataBlockLoad.template get<pMask>()[offset];
978  for (int i = 0 ; i < dim ; i++)
979  {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
980  }
981 
982  openfpm::sparse_index<unsigned int> sdataBlockPos;
983  sdataBlockPos.id = dataBlockPos;
984 
985  stencil::stencil(
986  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
987  curMask, args...);
988  }
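/*! \brief Illustrative note (added): the mask clamp inside applyStencilInPlace. Points
 *         whose coordinate falls outside the application box bx have curMask forced to 0,
 *         so the stencil functor treats them as NOT_EXIST and skips the store, while the
 *         chunk itself is still loaded and processed. The helper name is hypothetical.
 *
 * \code
 * // per-axis version of the clamp performed in the loop above
 * __device__ inline unsigned char clamp_mask_to_box(unsigned char mask, int c, int lo, int hi)
 * {
 *     return (c < lo || c > hi) ? 0 : mask;
 * }
 * \endcode
 */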
989 
990  template <unsigned int dim,
991  unsigned int pMask,
992  typename stencil,
993  typename IndexBufT,
994  typename DataBufT,
995  typename SparseGridT,
996  typename... Args>
997  __global__ void
998  applyStencilInPlaceNoShared(
999  Box<dim,int> bx,
1000  IndexBufT indexBuffer,
1001  DataBufT dataBuffer,
1002  SparseGridT sparseGrid,
1003  Args... args)
1004  {
1005  constexpr unsigned int pIndex = 0;
1006 
1007  typedef typename IndexBufT::value_type IndexAggregateT;
1008  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1009 
1010  typedef typename DataBufT::value_type AggregateT;
1011  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1012  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
1013  constexpr unsigned int blockSize = MaskBlockT::size;
1014 
1015  int p = blockIdx.x * blockDim.x + threadIdx.x;
1016 
1017  auto & pntBuff = sparseGrid.getPointBuffer();
1018 
1019  if (p >= pntBuff.size())
1020  {
1021  return;
1022  }
1023 
1024  auto id = pntBuff.template get<0>(p);
1025 
1026  const unsigned int dataBlockPos = id / blockSize;
1027  const unsigned int offset = id % blockSize;
1028 
1029  auto dataBlockLoad = dataBuffer.get(dataBlockPos);
1030 
1031  const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1032  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1033 
1034  unsigned char curMask;
1035 
1036  if (offset < blockSize)
1037  {
1038  // Read local mask to register
1039  curMask = dataBlockLoad.template get<pMask>()[offset];
1040  if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
1041  }
1042 
1043  openfpm::sparse_index<unsigned int> sdataBlockPos;
1044  sdataBlockPos.id = dataBlockPos;
1045 
1046  stencil::stencil(
1047  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1048  curMask, args...);
1049  }
1050 
1051  template<unsigned int pMask,
1052  typename dataBuffType,
1053  typename scanType,
1054  typename outType>
1055  __global__ void fill_e_points(dataBuffType dataBuf, scanType scanBuf, outType output)
1056  {
1057  typedef typename dataBuffType::value_type AggregateT;
1058  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1059  constexpr unsigned int blockSize = MaskBlockT::size;
1060 
1061  const unsigned int dataBlockPos = blockIdx.x;
1062  const unsigned int offset = threadIdx.x % blockSize;
1063 
1064  __shared__ int ato_cnt;
1065 
1066  if (threadIdx.x == 0)
1067  {ato_cnt = 0;}
1068 
1069  __syncthreads();
1070 
1071  if (dataBlockPos >= scanBuf.size() - 1)
1072  {
1073  return;
1074  }
1075 
1076  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1077 
1078  int id = atomicAdd(&ato_cnt,predicate);
1079 
1080  __syncthreads();
1081 
1082  if (predicate == true)
1083  {
1084  output.template get<0>(id + scanBuf.template get<0>(dataBlockPos)) = offset + dataBlockPos * blockSize;
1085  }
1086  }
1087 
1088  template<unsigned int pMask,
1089  typename dataBufferType,
1090  typename outType>
1091  __global__ void calc_exist_points(dataBufferType dataBuf, outType output)
1092  {
1093  typedef typename dataBufferType::value_type AggregateT;
1094  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1095  constexpr unsigned int blockSize = MaskBlockT::size;
1096 
1097  const unsigned int dataBlockPos = blockIdx.x;
1098  const unsigned int offset = threadIdx.x % blockSize;
1099 
1100  __shared__ int ato_cnt;
1101 
1102  if (threadIdx.x == 0)
1103  {ato_cnt = 0;}
1104 
1105  __syncthreads();
1106 
1107  if (dataBlockPos >= output.size())
1108  {
1109  return;
1110  }
1111 
1112  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1113 
1114  atomicAdd(&ato_cnt,predicate);
1115 
1116  __syncthreads();
1117 
1118  output.template get<0>(dataBlockPos) = ato_cnt;
1119  }
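/*! \brief Illustrative sketch (added): the count -> exclusive scan -> fill pattern that
 *         calc_exist_points and fill_e_points implement on the GPU, restated as a small
 *         host-side example. The scan buffer handed to fill_e_points is expected to hold
 *         the exclusive prefix sum of the per-chunk counts. Function and parameter names
 *         here are hypothetical.
 *
 * \code
 * #include <vector>
 *
 * std::vector<unsigned int> compact_existing_points(const std::vector<std::vector<unsigned char>> & masks,
 *                                                   unsigned int blockSize)
 * {
 *     // 1) count existing points per chunk (calc_exist_points)
 *     std::vector<unsigned int> cnt(masks.size(), 0);
 *     for (size_t b = 0; b < masks.size(); ++b)
 *         for (unsigned int o = 0; o < blockSize; ++o)
 *             cnt[b] += masks[b][o] & 0x1;
 *
 *     // 2) exclusive scan -> output offset of every chunk
 *     std::vector<unsigned int> scan(masks.size() + 1, 0);
 *     for (size_t b = 0; b < masks.size(); ++b) {scan[b + 1] = scan[b] + cnt[b];}
 *
 *     // 3) fill: global point id = offset + chunk * blockSize (fill_e_points)
 *     std::vector<unsigned int> out(scan.back());
 *     for (size_t b = 0; b < masks.size(); ++b)
 *     {
 *         unsigned int i = scan[b];
 *         for (unsigned int o = 0; o < blockSize; ++o)
 *         {if (masks[b][o] & 0x1) {out[i++] = o + b * blockSize;}}
 *     }
 *     return out;
 * }
 * \endcode
 */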
1120 
1121  template<unsigned int dim,
1122  unsigned int pMask,
1123  unsigned int blockEdgeSize,
1124  typename dataBufferType,
1125  typename outType,
1126  typename boxesVector_type,
1127  typename grid_smb_type,
1128  typename indexBuffer_type>
1129  __global__ void calc_remove_points_chunks_boxes(indexBuffer_type indexBuffer,
1130  boxesVector_type boxes,
1131  grid_smb_type grd,
1132  dataBufferType dataBuf,
1133  outType output)
1134  {
1135  typedef typename dataBufferType::value_type AggregateT;
1136  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1137 
1138  const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1139 
1140  if (dataBlockPos >= indexBuffer.size())
1141  {return;}
1142 
1143  auto id = indexBuffer.template get<0>(dataBlockPos);
1144  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize());
1145 
1145 
1146  Box<dim,unsigned int> b;
1147 
1148  for (int i = 0 ; i < dim ; i++)
1149  {
1150  b.setLow(i,pnt.get(i));
1151  b.setHigh(i,pnt.get(i) + blockEdgeSize - 1);
1152  }
1153 
1154  // this block intersect a remove box section so mark the chunk
1155 
1156  output.template get<1>(dataBlockPos) = 0;
1157  for (int k = 0 ; k < boxes.size() ; k++ )
1158  {
1159  Box<dim,unsigned int> btest = boxes.get(k);
1160 
1161  Box<dim,unsigned int> bout;
1162 
1163  if (btest.Intersect(b,bout) == true)
1164  {
1165  output.template get<1>(dataBlockPos) = 1;
1166  }
1167  }
1168  }
1169 
1170  template<typename outType,
1171  typename activeCnkType>
1172  __global__ void collect_rem_chunks(activeCnkType act,
1173  outType output)
1174  {
1175  const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1176 
1177  if (dataBlockPos >= act.size()-1)
1178  {return;}
1179 
1180  auto id = act.template get<1>(dataBlockPos);
1181  auto id_p1 = act.template get<1>(dataBlockPos+1);
1182 
1183  if (id != id_p1)
1184  {
1185  output.template get<0>(id) = dataBlockPos;
1186  }
1187  }
1188 
1189  template<unsigned int dim, unsigned int pMask,
1190  typename dataBufferType,
1191  typename indexBufferType,
1192  typename grid_smb_type,
1193  typename activeCntType,
1194  typename boxesType>
1195  __global__ void remove_points(indexBufferType indexBuffer,
1196  grid_smb_type grd,
1197  dataBufferType dataBuffer,
1198  activeCntType active_blocks,
1199  boxesType boxes)
1200  {
1201  typedef typename dataBufferType::value_type AggregateT;
1202  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1203  constexpr unsigned int blockSize = MaskBlockT::size;
1204 
1205  const unsigned int dataBlockPos = active_blocks.template get<0>(blockIdx.x);
1206  const unsigned int offset = threadIdx.x % blockSize;
1207 
1208  if (dataBlockPos >= dataBuffer.size()-1)
1209  {return;}
1210 
1211  int predicate = dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x1;
1212  // calculate point coord;
1213  auto id = indexBuffer.template get<0>(dataBlockPos);
1214  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1215  Point<dim,int> p;
1216 
1217  for (int i = 0 ; i < dim ; i++)
1218  {p.get(i) = pnt.get(i);}
1219 
1220  // if this block intersect any box
1221 
1222  if (predicate == true)
1223  {
1224  for (int k = 0 ; k < boxes.size() ; k++ )
1225  {
1226  Box<dim,unsigned int> box = boxes.get(k);
1227 
1228  if (box.isInside(p) == true)
1229  {
1230  dataBuffer.template get<pMask>(dataBlockPos)[offset] = 0;
1231  }
1232  }
1233  }
1234  }
1235 
1236  template<unsigned int dim,
1237  unsigned int pMask,
1238  unsigned int numCnt,
1239  typename indexT,
1240  typename dataBufferType,
1241  typename outType,
1242  typename boxesVector_type,
1243  typename grid_smb_type,
1244  typename indexBuffer_type>
1245  __global__ void calc_exist_points_with_boxes(indexBuffer_type indexBuffer,
1246  boxesVector_type boxes,
1247  grid_smb_type grd,
1248  dataBufferType dataBuf,
1249  outType output,
1250  unsigned int stride_size)
1251  {
1252  typedef typename dataBufferType::value_type AggregateT;
1253  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1254  constexpr unsigned int blockSize = MaskBlockT::size;
1255 
1256  const unsigned int dataBlockPos = blockIdx.x;
1257  const unsigned int offset = threadIdx.x % blockSize;
1258 
1259  __shared__ int ato_cnt[numCnt];
1260 
1261  if (threadIdx.x < numCnt)
1262  {ato_cnt[threadIdx.x] = 0;}
1263 
1264  __syncthreads();
1265 
1266 #ifdef SE_CLASS1
1267 
1268  if (numCnt > blockDim.x)
1269  {printf("Error calc_exist_points_with_boxes assertion failed numCnt > blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1270 
1271 #endif
1272 
1273  if (dataBlockPos >= output.size())
1274  {return;}
1275 
1276  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1277  // calculate point coord;
1278  indexT id = indexBuffer.template get<0>(dataBlockPos);
1279  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1280  Point<dim,int> p;
1281 
1282  for (int i = 0 ; i < dim ; i++)
1283  {p.get(i) = pnt.get(i);}
1284 
1285  // if this block intersect any box
1286 
1287  if (predicate == true)
1288  {
1289  for (int k = 0 ; k < boxes.size() ; k++ )
1290  {
1291  Box<dim,int> box = boxes.get(k);
1292 
1293  if (box.isInside(p) == true)
1294  {
1295  atomicAdd(&ato_cnt[k],1);
1296  }
1297  }
1298  }
1299 
1300  __syncthreads();
1301 
1302  if (threadIdx.x < boxes.size())
1303  {
1304  output.template get<0>(dataBlockPos+threadIdx.x*stride_size) = ato_cnt[threadIdx.x];
1305  output.template get<1>(dataBlockPos+threadIdx.x*stride_size) = (ato_cnt[threadIdx.x] != 0);
1306  }
1307  }
1308 
1319  template<unsigned int dim,
1320  unsigned int pMask,
1321  unsigned int numCnt,
1322  typename indexT,
1323  typename dataBufferType,
1324  typename packBufferType,
1325  typename scanType,
1326  typename scanItType,
1327  typename outputType,
1328  typename boxesVector_type,
1329  typename grid_smb_type,
1330  typename indexBuffer_type>
1331  __global__ void get_exist_points_with_boxes(indexBuffer_type indexBuffer,
1332  boxesVector_type boxes,
1333  grid_smb_type grd,
1334  dataBufferType dataBuf,
1335  packBufferType pack_output,
1336  scanType scan,
1337  scanItType scan_it,
1338  outputType output)
1339  {
1340  typedef typename dataBufferType::value_type AggregateT;
1341  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1342  constexpr unsigned int blockSize = MaskBlockT::size;
1343 
1344  const unsigned int dataBlockPos = blockIdx.x;
1345  const unsigned int offset = threadIdx.x % blockSize;
1346 
1347  __shared__ int ato_cnt[numCnt];
1348 
1349  if (threadIdx.x < numCnt)
1350  {ato_cnt[threadIdx.x] = 0;}
1351 
1352  __syncthreads();
1353 
1354 #ifdef SE_CLASS1
1355 
1356  if (numCnt > blockDim.x)
1357  {printf("Error get_exist_points_with_boxes assertion failed numCnt > blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1358 
1359 #endif
1360 
1361  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1362  // calculate point coord;
1363  indexT id = indexBuffer.template get<0>(dataBlockPos);
1364  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1365  Point<dim,int> p_;
1366 
1367  for (int i = 0 ; i < dim ; i++)
1368  {p_.get(i) = pnt.get(i);}
1369 
1370  // if this block intersect any box
1371 
1372  if (predicate == true)
1373  {
1374  for (int k = 0 ; k < boxes.size() ; k++ )
1375  {
1376  Box<dim,int> box = boxes.get(k);
1377 
1378  if (box.isInside(p_) == true)
1379  {
1380  // We have an atomic counter for every packing box
1381  int p = atomicAdd(&ato_cnt[k] , 1);
1382 
1383  // we have a scan for every box
1384  const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuffer.size() + 1));
1385  unsigned int sit = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1));
1386  int scan_id = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1)) + scan_it.template get<0>(k);
1387  output.template get<0>(scan_id + p) = (offset + dataBlockPos * blockSize) * numCnt + k;
1388  pack_output.template get<0>(scan_id + p) = p + sit;
1389  }
1390  }
1391  }
1392  }
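/*! \brief Illustrative note (added): the id written into output above packs the global
 *         point id together with the packing-box index k; pack_data below (with
 *         n_it == numCnt) undoes that packing.
 *
 * \code
 * // encode (get_exist_points_with_boxes):
 * //   pb = (offset + dataBlockPos * blockSize) * numCnt + k
 * // decode (pack_data):
 * //   k            = pb % n_it
 * //   id           = pb / n_it
 * //   dataBlockPos = id / blockSize
 * //   offset       = id % blockSize
 * //
 * // Worked example: blockSize = 64, numCnt = 4, dataBlockPos = 3, offset = 10, k = 2
 * //   pb = (10 + 3*64)*4 + 2 = 810
 * //   810 % 4 = 2 (k), 810 / 4 = 202, 202 / 64 = 3 (dataBlockPos), 202 % 64 = 10 (offset)
 * \endcode
 */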
1393 
1394 
1395 
1396 
1397  template<unsigned int dim,
1398  unsigned int blockSize,
1399  unsigned int blockEdgeSize,
1400  unsigned int nshifts,
1401  typename indexT,
1402  typename linearizer,
1403  typename shiftTypeVector,
1404  typename outputType>
1405  __global__ void convert_chunk_ids(indexT * ids,
1406  int n_cnk,
1407  linearizer gridGeoPack,
1408  grid_key_dx<dim,int> origPack,
1409  linearizer gridGeo,
1410  grid_key_dx<dim,int> origUnpack,
1411  outputType output,
1412  shiftTypeVector shifts,
1413  grid_key_dx<dim,int> sz,
1414  int bs)
1415  {
1416  // points
1417  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1418 
1419  if (p >= n_cnk)
1420  {return;}
1421 
1422  auto id = ids[p];
1423 
1424  for (int i = 0 ; i < nshifts ; i++)
1425  {
1426  grid_key_dx<dim,int> pos = gridGeoPack.InvLinId(id,0) - origPack + origUnpack;
1427 
1428  for (int j = 0 ; j < dim ; j++)
1429  {
1430  pos.set_d(j,pos.get(j) + shifts.template get<0>(i)[j]*blockEdgeSize);
1431  if (pos.get(j) < 0)
1432  {
1433  pos.set_d(j,0);
1434  }
1435  if (pos.get(j) >= sz.get(j))
1436  {
1437  pos.set_d(j,pos.get(j) - blockEdgeSize);
1438  }
1439  }
1440 
1441  auto plin = gridGeo.LinId(pos);
1442 
1443  output.template get<0>(p*nshifts + i + bs) = plin / blockSize;
1444  }
1445  }
1446 
1447  template<typename vectorType>
1448  __global__ void set_one(vectorType vt)
1449  {
1450  // points
1451  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1452 
1453  if (p >= vt.size())
1454  {return;}
1455 
1456  vt.template get<0>(p) = 1;
1457  }
1458 
1459  template<unsigned int pSegment,typename newMapType, typename mergeMapType,
1460  typename dataMapType, typename segmentOffsetType,
1461  typename outMapType>
1462  __global__ void construct_new_chunk_map(newMapType new_map, dataMapType dataMap,
1463  mergeMapType merge_id, outMapType outMap,
1464  segmentOffsetType segments_data, int start_p)
1465  {
1466  // points
1467  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1468 
1469  if (p >= segments_data.size()-1)
1470  {return;}
1471 
1472  unsigned int st = segments_data.template get<pSegment>(p);
1473 
1474  int segmentSize = segments_data.template get<pSegment>(p + 1)
1475  - segments_data.template get<pSegment>(p);
1476 
1477  for (int j = 0 ; j < segmentSize ; j++)
1478  {
1479  int dm = dataMap.template get<0>(st+j);
1480  new_map.template get<0>(dm) = outMap.template get<0>(p);
1481  }
1482  }
1483 
1484  template<unsigned int pMask, typename AggregateT, typename blockConvertType, typename newMapType, typename dataType_ptrs, typename dataType, unsigned int ... prp>
1485  __global__ void copy_packed_data_to_chunks(unsigned int * scan,
1486  unsigned short int * offsets,
1487  blockConvertType blc,
1488  newMapType new_map,
1489  dataType_ptrs data_ptrs,
1490  dataType data_buff,
1491  unsigned int n_cnk,
1492  unsigned int n_shf,
1493  unsigned int n_pnt,
1494  unsigned int i,
1495  unsigned int n_accu_cnk)
1496  {
1497  // points
1498  const unsigned int p = blockIdx.x;
1499 
1500  if (p >= n_cnk)
1501  {return;}
1502 
1503  int scan_pp = scan[p];
1504  int n_block_pnt = scan[p+1] - scan_pp;
1505 
1506  if (threadIdx.x < n_block_pnt)
1507  {
1508  unsigned short int off = offsets[scan[p] + threadIdx.x];
1509 
1510  int conv = blc.template get<0>(i)[off];
1511 
1512  unsigned short int off_c = conv & 0xFFFF;
1513  unsigned short int shf_c = conv >> 16;
1514 
1515  unsigned int pos_c = new_map.template get<0>(n_shf*p + shf_c + n_accu_cnk);
1516 
1517  sparsegridgpu_unpack_impl<AggregateT, dataType ,prp ...>
1518  spi(pos_c,off_c,data_buff,scan_pp + threadIdx.x,data_ptrs,n_pnt);
1519 
1520  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1521 
1522  data_buff.template get<pMask>(pos_c)[off_c] |= 0x1;
1523  }
1524  }
1525 
1526 
1527  template<typename scanPointerType, typename scanType>
1528  __global__ void last_scan_point(scanPointerType scan_ptr, scanType scan,unsigned int stride, unsigned int n_pack)
1529  {
1530  const unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
1531 
1532  if (k >= n_pack) {return;}
1533 
1534  unsigned int ppos = scan.template get<0>((k+1)*stride-1);
1535  unsigned int pos = scan.template get<1>((k+1)*stride-1);
1536 
1537  ((unsigned int *)scan_ptr.ptr[k])[pos] = ppos;
1538  }
1539 
1540  template<unsigned int pMask,
1541  typename AggregateT,
1542  unsigned int n_it,
1543  unsigned int n_prp,
1544  typename indexT,
1545  typename pntBuff_type,
1546  typename pointOffset_type,
1547  typename indexBuffer_type,
1548  typename dataBuffer_type,
1549  typename scan_type,
1550  unsigned int blockSize,
1551  unsigned int ... prp>
1552  __global__ void pack_data(pntBuff_type pntBuff,
1553  dataBuffer_type dataBuff,
1554  indexBuffer_type indexBuff,
1555  scan_type scan,
1556  pointOffset_type point_offsets,
1557  arr_ptr<n_it> index_ptr,
1558  arr_ptr<n_it> scan_ptr,
1559  arr_arr_ptr<n_it,n_prp> * data_ptr,
1560  arr_ptr<n_it> offset_ptr,
1561  arr_ptr<n_it> mask_ptr,
1562  static_array<n_it,unsigned int> sar)
1563  {
1564  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1565 
1566  if (p >= pntBuff.size())
1567  {return;}
1568 
1569  const unsigned int pb = pntBuff.template get<0>(p);
1570  const unsigned int p_offset = point_offsets.template get<0>(p);
1571 
1572  const unsigned int k = pb % n_it;
1573  const unsigned int id = pb / n_it;
1574 
1575  const unsigned int dataBlockPos = id / blockSize;
1576  const unsigned int offset = id % blockSize;
1577 
1578  unsigned int ppos = scan.template get<0>(dataBlockPos + k*(indexBuff.size() + 1));
1579  const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuff.size() + 1));
1580 
1581  sparsegridgpu_pack_impl<AggregateT, dataBuffer_type ,prp ...>
1582  spi(dataBlockPos,offset,dataBuff,p_offset,data_ptr->ptr[k],sar.sa[k]);
1583 
1584  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1585 
1586  ((unsigned int *)scan_ptr.ptr[k])[dataBlockPosPack] = ppos;
1587 
1588  ((indexT *)index_ptr.ptr[k])[dataBlockPosPack] = indexBuff.template get<0>(dataBlockPos);
1589  ((short int *)offset_ptr.ptr[k])[p_offset] = offset;
1590  ((unsigned char *)mask_ptr.ptr[k])[p_offset] = dataBuff.template get<pMask>(dataBlockPos)[offset];
1591  }
1592 }
1593 
1594 #endif //OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH