OpenFPM  5.2.0
Project that contains the implementation of distributed structures
SparseGridGpu_kernels.cuh
1 //
2 // Created by tommaso on 19/06/19.
3 //
4 
5 #ifndef OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
6 #define OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
7 
8 #include <SparseGridGpu/BlockMapGpu.hpp>
9 #include <SparseGridGpu/TemplateUtils/mathUtils.hpp>
10 #include "util/cuda_util.hpp"
11 #include "SparseGrid/cp_block.hpp"
12 
13 #ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
14 #define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
15 #endif
16 
17 #ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
18 #define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
19 #endif
20 
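// Per-point mask values: bit 0 set means the point exists, bit 1 set means it lies on the
// padding (boundary/ghost) layer; EXIST_AND_PADDING is the combination of both bits.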
21 enum mask_sparse
22 {
23  NOT_EXIST = 0,
24  EXIST = 1,
25  PADDING = 2,
26  EXIST_AND_PADDING = 3
27 };
28 
29 // Kernels for SparseGridGpu
30 namespace SparseGridGpuKernels
31 {
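 // Read the headers of a set of packed buffers: each thread walks one packed stream, copies the
 // 8-byte chunk index into the headers output and advances by the amount reported by
 // SparseGridGpuType::unpack_headers; result[0] is set to 1 if a stream contains more than n_slot headers.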
32  template<typename SparseGridGpuType, typename pointers_type, typename headers_type>
33  __global__ void unpack_headers(pointers_type pointers, headers_type headers, int * result, unsigned int sz_pack, int n_slot)
34  {
35  int t = threadIdx.x;
36 
37  if (t >= pointers.size()) {return;}
38 
39  unsigned char * data_pack = (unsigned char *)pointers.template get<0>(t);
40 
41  while (data_pack < pointers.template get<1>(t) )
42  {
43  int ih = pointers.template get<2>(t);
44  if (n_slot > ih)
45  {
46  if (sizeof(typename SparseGridGpuType::indexT_) == 8)
47  {headers.template get<0>(t*n_slot + ih) = *(size_t *)data_pack;}
48  else
49  {
50  unsigned int dp1 = *(unsigned int *)data_pack;
51  unsigned int dp2 = *(unsigned int *)&(data_pack[4]);
52  headers.template get<0>(t*n_slot + ih) = (size_t)dp1 + (((size_t)dp2) << 32);
53  }
54  data_pack += sizeof(size_t);
55  data_pack += SparseGridGpuType::unpack_headers(headers,data_pack,t*n_slot + ih,sz_pack);
56  pointers.template get<2>(t) += 1;
57  }
58  else
59  {
60  // report error
61  result[0] = 1;
62  return;
63  }
64  }
65  }
66 
67 
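 // Helper that evaluates a radius-one cross (star) stencil from the enlarged shared-memory block:
 // it gathers the +1/-1 neighbours along every dimension (xm/xp) and passes them, together with
 // the centre value, to the user lambda f.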
68  template<unsigned int dim>
69  struct stencil_cross_func_impl
70  {
71  template<typename ScalarT, typename coordType, typename SparseGridT, unsigned int enlargedBlockSize, typename lambda_func, typename ... ArgsT>
72  __device__ static inline void stencil(ScalarT & res, ScalarT & cur, coordType & coord ,
73  ScalarT (& enlargedBlock)[enlargedBlockSize],
74  lambda_func f,
75  SparseGridT & sparseGrid, ArgsT ... args)
76  {
77  cross_stencil<dim,ScalarT> cs;
78 
79  for (int d = 0; d < dim; ++d)
80  {
81  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
82  auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
83  ScalarT neighbourPlus = enlargedBlock[nPlusId];
84  ScalarT neighbourMinus = enlargedBlock[nMinusId];
85 
86  cs.xm[d] = neighbourMinus;
87  cs.xp[d] = neighbourPlus;
88  }
89 
90  res = f(cur,cs, args ...);
91  }
92  };
93 
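 // Dispatcher for the convolution lambdas: only the 3D and 2D specialisations below are
 // implemented; the generic version prints a run-time error.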
94  template<unsigned int dim>
95  struct stencil_conv_func_impl
96  {
97  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
98  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
99  CpBlockType & cpb,
100  lambda_func f,
101  ArgsT ... args)
102  {
103  printf("Convolution operation on GPU: Dimension not implemented \n");
104  }
105 
106  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
107  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
108  CpBlockType & cpb1,
109  CpBlockType & cpb2,
110  lambda_func f,
111  ArgsT ... args)
112  {
113  printf("Convolution operation on GPU: Dimension not implemented \n");
114  }
115  };
116 
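 // 3D specialisation: the user lambda is called as f(cpb..., x, y, z), plus the loaded data
 // block and the in-chunk offset for the *_block variants.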
117  template<>
118  struct stencil_conv_func_impl<3>
119  {
120  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
121  __device__ static inline void stencil_block(ScalarT & res, coordType & coord ,
122  CpBlockType & cpb,
123  DataBlockWrapperT & DataBlockLoad,
124  int offset,
125  lambda_func f,
126  ArgsT ... args)
127  {
128  res = f(cpb,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
129  }
130 
131  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
132  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
133  CpBlockType & cpb,
134  lambda_func f,
135  ArgsT ... args)
136  {
137  res = f(cpb,coord[0],coord[1],coord[2]);
138  }
139 
140  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
141  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
142  CpBlockType & cpb1,
143  CpBlockType & cpb2,
144  lambda_func f,
145  ArgsT ... args)
146  {
147  f(res1,res2,cpb1,cpb2,coord[0],coord[1],coord[2]);
148  }
149 
150  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
151  __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
152  CpBlockType & cpb1,
153  CpBlockType & cpb2,
154  DataBlockWrapperT & DataBlockLoad,
155  int offset,
156  lambda_func f,
157  ArgsT ... args)
158  {
159  f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
160  }
161 
162  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
163  __device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
164  CpBlockType & cpb1,
165  CpBlockType & cpb2,
166  CpBlockType & cpb3,
167  DataBlockWrapperT & DataBlockLoad,
168  int offset,
169  lambda_func f,
170  ArgsT ... args)
171  {
172  f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
173  }
174  };
175 
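 // 2D specialisation: same as above, but the lambda receives only the x and y coordinates.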
176  template<>
177  struct stencil_conv_func_impl<2>
178  {
179  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
180  __device__ static inline void stencil_block(ScalarT & res, coordType & coord,
181  CpBlockType & cpb,
182  DataBlockWrapperT & DataBlockLoad,
183  int offset,
184  lambda_func f,
185  ArgsT ... args)
186  {
187  res = f(cpb,DataBlockLoad,offset,coord[0],coord[1]);
188  }
189 
190  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
191  __device__ static inline void stencil(ScalarT & res, coordType & coord ,
192  CpBlockType & cpb,
193  lambda_func f,
194  ArgsT ... args)
195  {
196  res = f(cpb,coord[0],coord[1]);
197  }
198 
199  template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
200  __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
201  CpBlockType & cpb1,
202  CpBlockType & cpb2,
203  lambda_func f,
204  ArgsT ... args)
205  {
206  f(res1,res2,cpb1,cpb2,coord[0],coord[1]);
207  }
208 
209  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
210  __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
211  CpBlockType & cpb1,
212  CpBlockType & cpb2,
213  DataBlockWrapperT & DataBlockLoad,
214  int offset,
215  lambda_func f,
216  ArgsT ... args)
217  {
218  f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
219  }
220 
221  template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
222  __device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
223  CpBlockType & cpb1,
224  CpBlockType & cpb2,
225  CpBlockType & cpb3,
226  DataBlockWrapperT & DataBlockLoad,
227  int offset,
228  lambda_func f,
229  ArgsT ... args)
230  {
231  f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1]);
232  }
233  };
234 
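 // Convolution stencil functor used by the in-place stencil kernels: it fills the enlarged
 // shared-memory block with the background value, loads the chunk plus its ghost layer, wraps
 // it in a cp_block and evaluates the user lambda on every existing, non-padding point.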
235  template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
236  struct stencil_cross_func_conv
237  {
238  typedef NNStar<dim> stencil_type;
239 
240  static constexpr unsigned int supportRadius = stencil_size;
241 
242  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
243  static inline __device__ void stencil(
244  SparseGridT & sparseGrid,
245  const unsigned int dataBlockId,
246  openfpm::sparse_index<unsigned int> dataBlockIdPos,
247  unsigned int offset,
248  grid_key_dx<dim, int> & pointCoord,
249  DataBlockWrapperT & dataBlockLoad,
250  DataBlockWrapperT & dataBlockStore,
251  unsigned char curMask,
252  lambda_func f,
253  ArgT ... args)
254  {
255  typedef typename SparseGridT::AggregateBlockType AggregateT;
256  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
257 
258  constexpr unsigned int enlargedBlockSize = IntPow<
259  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
260 
261  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
262 
263  for (int i = 0; i < n_loop ; i++)
264  {
265  if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
266  {
267  enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
268  }
269  }
270 
271  __syncthreads();
272 
273  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
274  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
275 
276  cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock);
277 
278  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
279 
280  __syncthreads();
281 
282  ScalarT res = 0;
283 
284  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
285  {
286  int coord[dim];
287 
288  unsigned int linIdTmp = offset;
289  for (unsigned int d = 0; d < dim; ++d)
290  {
291  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
292  linIdTmp /= SparseGridT::blockEdgeSize_;
293  }
294 
295  stencil_conv_func_impl<dim>::stencil(res,coord,cpb,f,args...);
296 
297  dataBlockStore.template get<p_dst>()[offset] = res;
298  }
299  }
300 
301  template <typename SparseGridT>
302  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
303  {
304  // No flush
305  }
306  };
307 
308 
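 // Variant of the convolution functor above where the user lambda additionally receives the
 // loaded data block and the offset of the point inside it (stencil_block call).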
309  template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
310  struct stencil_cross_func_conv_block_read
311  {
312  typedef NNStar<dim> stencil_type;
313 
314  static constexpr unsigned int supportRadius = stencil_size;
315 
316  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
317  static inline __device__ void stencil(
318  SparseGridT & sparseGrid,
319  const unsigned int dataBlockId,
320  openfpm::sparse_index<unsigned int> dataBlockIdPos,
321  unsigned int offset,
322  grid_key_dx<dim, int> & pointCoord,
323  DataBlockWrapperT & dataBlockLoad,
324  DataBlockWrapperT & dataBlockStore,
325  unsigned char curMask,
326  lambda_func f,
327  ArgT ... args)
328  {
329  typedef typename SparseGridT::AggregateBlockType AggregateT;
330  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
331 
332  constexpr unsigned int enlargedBlockSize = IntPow<
333  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
334 
335  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
336 
337  for (int i = 0; i < n_loop ; i++)
338  {
339  if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
340  {
341  enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
342  }
343  }
344 
345  __syncthreads();
346 
347  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
348  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
349 
350  cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock);
351 
352  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
353 
354  __syncthreads();
355 
356  ScalarT res = 0;
357 
358  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
359  {
360  int coord[dim];
361 
362  unsigned int linIdTmp = offset;
363  for (unsigned int d = 0; d < dim; ++d)
364  {
365  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
366  linIdTmp /= SparseGridT::blockEdgeSize_;
367  }
368 
369  stencil_conv_func_impl<dim>::stencil_block(res,coord,cpb,dataBlockLoad,offset,f,args...);
370 
371  dataBlockStore.template get<p_dst>()[offset] = res;
372  }
373  }
374 
375  template <typename SparseGridT>
376  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
377  {
378  // No flush
379  }
380  };
381 
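 // Two-property variant: loads two ghost blocks (p_src1, p_src2) and produces two results
 // (p_dst1, p_dst2) per point, passing the data block and offset to the lambda.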
382  template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
383  struct stencil_cross_func_conv2_b
384  {
385  typedef NNStar<dim> stencil_type;
386 
387  static constexpr unsigned int supportRadius = stencil_size;
388 
389  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
390  static inline __device__ void stencil(
391  SparseGridT & sparseGrid,
392  const unsigned int dataBlockId,
393  openfpm::sparse_index<unsigned int> dataBlockIdPos,
394  unsigned int offset,
395  grid_key_dx<dim, int> & pointCoord,
396  DataBlockWrapperT & dataBlockLoad,
397  DataBlockWrapperT & dataBlockStore,
398  unsigned char curMask,
399  lambda_func f,
400  ArgT ... args)
401  {
402  typedef typename SparseGridT::AggregateBlockType AggregateT;
403  typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
404  typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
405 
406  constexpr unsigned int enlargedBlockSize = IntPow<
407  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
408 
409  __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
410  __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
411 
412  // fill with background
413 
414  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
415  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
416 
417  cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
418  cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
419 
420  sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
421  sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
422 
423  __syncthreads();
424 
425  ScalarT1 res1 = 0;
426  ScalarT2 res2 = 0;
427 
428  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
429  {
430  int coord[dim];
431 
432  unsigned int linIdTmp = offset;
433  for (unsigned int d = 0; d < dim; ++d)
434  {
435  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
436  linIdTmp /= SparseGridT::blockEdgeSize_;
437  }
438 
439  stencil_conv_func_impl<dim>::stencil2_block(res1,res2,coord,cpb1,cpb2,dataBlockLoad,offset,f,args...);
440 
441  dataBlockStore.template get<p_dst1>()[offset] = res1;
442  dataBlockStore.template get<p_dst2>()[offset] = res2;
443  }
444  }
445 
446  template <typename SparseGridT>
447  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
448  {
449  // No flush
450  }
451  };
452 
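 // Three-property variant: loads three ghost blocks and writes three results per point.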
453  template<unsigned int dim, unsigned int n_loop,
454  unsigned int p_src1, unsigned int p_src2, unsigned int p_src3,
455  unsigned int p_dst1, unsigned int p_dst2, unsigned int p_dst3,
456  unsigned int stencil_size>
457  struct stencil_cross_func_conv3_b
458  {
459  typedef NNStar<dim> stencil_type;
460 
461  static constexpr unsigned int supportRadius = stencil_size;
462 
463  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
464  static inline __device__ void stencil(
465  SparseGridT & sparseGrid,
466  const unsigned int dataBlockId,
467  openfpm::sparse_index<unsigned int> dataBlockIdPos,
468  unsigned int offset,
469  grid_key_dx<dim, int> & pointCoord,
470  DataBlockWrapperT & dataBlockLoad,
471  DataBlockWrapperT & dataBlockStore,
472  unsigned char curMask,
473  lambda_func f,
474  ArgT ... args)
475  {
476  typedef typename SparseGridT::AggregateBlockType AggregateT;
477  typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
478  typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
479  typedef ScalarTypeOf<AggregateT, p_src3> ScalarT3;
480 
481  constexpr unsigned int enlargedBlockSize = IntPow<
482  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
483 
484  __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
485  __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
486  __shared__ ScalarT3 enlargedBlock3[enlargedBlockSize];
487 
488  // fill with background
489 
490  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
491  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
492 
493  cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
494  cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
495  cp_block<ScalarT3,stencil_size,vmpl_sizes,dim> cpb3(enlargedBlock3);
496 
497  sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
498  sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
499  sparseGrid.template loadGhostBlock<p_src3>(dataBlockLoad, dataBlockIdPos, enlargedBlock3);
500 
501  __syncthreads();
502 
503  ScalarT1 res1 = 0;
504  ScalarT2 res2 = 0;
505  ScalarT3 res3 = 0;
506 
507  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
508  {
509  int coord[dim];
510 
511  unsigned int linIdTmp = offset;
512  for (unsigned int d = 0; d < dim; ++d)
513  {
514  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
515  linIdTmp /= SparseGridT::blockEdgeSize_;
516  }
517 
518  stencil_conv_func_impl<dim>::stencil3_block(res1,res2,res3,coord,cpb1,cpb2,cpb3,dataBlockLoad,offset,f,args...);
519 
520  dataBlockStore.template get<p_dst1>()[offset] = res1;
521  dataBlockStore.template get<p_dst2>()[offset] = res2;
522  dataBlockStore.template get<p_dst3>()[offset] = res3;
523  }
524  }
525 
526  template <typename SparseGridT>
527  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
528  {
529  // No flush
530  }
531  };
532 
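 // Two-property variant without the data block and offset arguments in the lambda call.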
533  template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
534  struct stencil_cross_func_conv2
535  {
536  typedef NNStar<dim> stencil_type;
537 
538  static constexpr unsigned int supportRadius = stencil_size;
539 
540  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
541  static inline __device__ void stencil(
542  SparseGridT & sparseGrid,
543  const unsigned int dataBlockId,
544  openfpm::sparse_index<unsigned int> dataBlockIdPos,
545  unsigned int offset,
546  grid_key_dx<dim, int> & pointCoord,
547  DataBlockWrapperT & dataBlockLoad,
548  DataBlockWrapperT & dataBlockStore,
549  unsigned char curMask,
550  lambda_func f,
551  ArgT ... args)
552  {
553  typedef typename SparseGridT::AggregateBlockType AggregateT;
554  typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
555  typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
556 
557  constexpr unsigned int enlargedBlockSize = IntPow<
558  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
559 
560  __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
561  __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
562 
563  // fill with background
564 
565  typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
566  typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
567 
568  cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
569  cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
570 
571  sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
572  sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
573 
574  __syncthreads();
575 
576  ScalarT1 res1 = 0;
577  ScalarT2 res2 = 0;
578 
579  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
580  {
581  int coord[dim];
582 
583  unsigned int linIdTmp = offset;
584  for (unsigned int d = 0; d < dim; ++d)
585  {
586  coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
587  linIdTmp /= SparseGridT::blockEdgeSize_;
588  }
589 
590  stencil_conv_func_impl<dim>::stencil2(res1,res2,coord,cpb1,cpb2,f,args...);
591 
592  dataBlockStore.template get<p_dst1>()[offset] = res1;
593  dataBlockStore.template get<p_dst2>()[offset] = res2;
594  }
595  }
596 
597  template <typename SparseGridT>
598  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
599  {
600  // No flush
601  }
602  };
603 
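 // Cross-stencil functor: loads a single ghost block, evaluates the radius-one cross stencil
 // through stencil_cross_func_impl and writes the whole result block back with storeBlock.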
604  template<unsigned int dim, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
605  struct stencil_cross_func
606  {
607  typedef NNStar<dim> stencil_type;
608 
609  static constexpr unsigned int supportRadius = stencil_size;
610 
611  template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
612  static inline __device__ void stencil(
613  SparseGridT & sparseGrid,
614  const unsigned int dataBlockId,
615  openfpm::sparse_index<unsigned int> dataBlockIdPos,
616  unsigned int offset,
617  grid_key_dx<dim, int> & pointCoord,
618  DataBlockWrapperT & dataBlockLoad,
619  DataBlockWrapperT & dataBlockStore,
620  unsigned char curMask,
621  lambda_func f,
622  ArgT ... args)
623  {
624  typedef typename SparseGridT::AggregateBlockType AggregateT;
625  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
626 
627  constexpr unsigned int enlargedBlockSize = IntPow<
628  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
629 
630  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
631 
632  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
633 
634  __syncthreads();
635 
636  decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
637  ScalarT res = 0;
638 
639  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
640  {
641  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
642  // const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
643  linId = sparseGrid.getLinIdInEnlargedBlock(offset);
644  ScalarT cur = enlargedBlock[linId];
645 
646  stencil_cross_func_impl<dim>::stencil(res,cur,coord,enlargedBlock,f,sparseGrid,args ...);
647 
648 
649  }
650  __syncthreads();
651  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
652  {
653  enlargedBlock[linId] = res;
654  }
655  __syncthreads();
656  sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
657  }
658 
659  template <typename SparseGridT>
660  static inline void __host__ flush(SparseGridT & sparseGrid, gpu::context_t& gpuContext)
661  {
662  // No flush
663  }
664  };
665 
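 // Scan every existing point of each chunk and set or clear its PADDING bit depending on whether
 // NN_type::isPadding reports an incomplete neighbourhood; chk is an additional per-point filter.
 // One data chunk is processed per CUDA block.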
666  // This kernel is to be called with 1D parameters (?)
667  template <unsigned int dim,
668  unsigned int stencilSupportRadius,
669  unsigned int pMask,
670  typename NN_type,
671  typename checker_type,
672  typename IndexBufT,
673  typename DataBufT,
674  typename SparseGridT,
675  typename nn_blocksT>
676  __global__ void tagBoundaries(IndexBufT indexBuffer, DataBufT dataBuffer, SparseGridT sparseGrid,nn_blocksT nbT, checker_type chk)
677  {
678  //todo: #ifdef __NVCC__
679  constexpr unsigned int pIndex = 0;
680 
681  typedef typename IndexBufT::value_type IndexAggregateT;
682  typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;
683 
684  typedef typename DataBufT::value_type AggregateT;
685  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
686  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
687  constexpr unsigned int blockSize = MaskBlockT::size;
688 
689  // NOTE: here we do one chunk per block! (we want to be sure to fit within local memory
690  // constraints, since we will also be loading the neighbouring elements - beware the curse of dimensionality...)
691  const unsigned int dataBlockPos = blockIdx.x;
692  const unsigned int offset = threadIdx.x;
693 
694  constexpr unsigned int enlargedBlockSize = IntPow<
695  sparseGrid.getBlockEdgeSize() + 2 * stencilSupportRadius, dim>::value;
696  __shared__ MaskT enlargedBlock[enlargedBlockSize];
697 
698  if (dataBlockPos >= indexBuffer.size())
699  {
700  return;
701  }
702 
703  const long long dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
704  auto dataBlock = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
705 
706  openfpm::sparse_index<unsigned int> sdataBlockPos;
707  sdataBlockPos.id = dataBlockPos;
708  sparseGrid.template loadGhostBlock<pMask>(dataBlock,sdataBlockPos,enlargedBlock);
709 
710  __syncthreads();
711 
712  bool check = chk.check(sparseGrid,dataBlockId,offset);
713 
714  //Here code for tagging the boundary
715  if (offset < blockSize && check == true)
716  {
717  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
718  const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
719 
720  MaskT cur = enlargedBlock[linId];
721  if (sparseGrid.exist(cur))
722  {
723  bool isPadding = NN_type::isPadding(sparseGrid,coord,enlargedBlock);
724  if (isPadding)
725  {
726  sparseGrid.setPadding(enlargedBlock[linId]);
727  }
728  else
729  {
730  sparseGrid.unsetPadding(enlargedBlock[linId]);
731  }
732  }
733  }
734  // Write block back to global memory
735  __syncthreads();
736  sparseGrid.template storeBlock<pMask>(dataBlock, enlargedBlock);
737  }
738 
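 // For every padding point of grid_cu, count whether the corresponding coarse point of grid_up
 // (coordinates halved) is marked as existing, accumulating the count per data chunk in out.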
743  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType>
744  __global__ void link_construct(SparseGridType grid_up, SparseGridType grid_cu, outputType out)
745  {
746  const unsigned int dataBlockPos = blockIdx.x;
747  const unsigned int offset = threadIdx.x;
748 
749  auto & indexBuffer = grid_cu.getIndexBuffer();
750  auto & dataBuffer = grid_cu.getDataBuffer();
751 
752  // if the point is a padding
753  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
754  {
755  auto id = indexBuffer.template get<0>(dataBlockPos);
756  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
757 
758  printf("HERE %d %d \n",pos.get(0),pos.get(1));
759 
760  for (int i = 0 ; i < dim ; i++)
761  {pos.set_d(i,pos.get(i) / 2);}
762 
763  if (grid_up.template get<pMask>(pos) == 0x1)
764  {
765  atomicAdd(&out.template get<0>(dataBlockPos),1);
766  }
767  }
768  }
769 
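 // Count, per data chunk, the padding points that fall inside the given box.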
774  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType, typename BoxType>
775  __global__ void count_paddings(SparseGridType grid_cu, outputType out, BoxType box)
776  {
777  const unsigned int dataBlockPos = blockIdx.x;
778  const unsigned int offset = threadIdx.x;
779 
780  auto & indexBuffer = grid_cu.getIndexBuffer();
781  auto & dataBuffer = grid_cu.getDataBuffer();
782 
783  auto id = indexBuffer.template get<0>(dataBlockPos);
784 
785  // check if the point is inside the box
786  auto coord = grid_cu.getCoord(id,offset);
787 
788  bool active = box.isInsideKey(coord);
789 
790  if (active == false)
791  {return;}
792 
793  // if the point is a padding
794  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
795  {
796  atomicAdd(&out.template get<0>(dataBlockPos),1);
797  }
798  }
799 
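 // Collect the padding points that fall inside the given box into out (chunk position and
 // in-chunk offset), using the per-chunk offsets produced by the scan stp.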
804  template<unsigned int pMask, typename SparseGridType, typename ScanType, typename outputType, typename BoxType>
805  __global__ void collect_paddings(SparseGridType grid_cu, ScanType stp, outputType out, BoxType box)
806  {
807  const unsigned int dataBlockPos = blockIdx.x;
808  const unsigned int offset = threadIdx.x;
809 
810  __shared__ int counter;
811  counter = 0;
812  __syncthreads();
813 
814  auto & indexBuffer = grid_cu.getIndexBuffer();
815  auto & dataBuffer = grid_cu.getDataBuffer();
816 
817  auto id = indexBuffer.template get<0>(dataBlockPos);
818 
819  // check if the point is inside the box
820  auto coord = grid_cu.getCoord(id,offset);
821 
822  bool active = box.isInsideKey(coord);
823 
824  if (active == false)
825  {return;}
826 
827  int pad_offset = stp.template get<0>(dataBlockPos);
828 
829  // if the point is a padding
830  if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
831  {
832  int cnt = atomicAdd(&counter,1);
833 
834  out.template get<0>(pad_offset + cnt) = dataBlockPos;
835  out.template get<1>(pad_offset + cnt) = offset;
836  }
837  }
838 
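 // For every padding point, count the existing points of the finer grid (grid_dw) obtained by
 // doubling the coordinates (shifted by p_dw) and adding a binary offset per dimension; the
 // count is accumulated per padding point in out.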
843  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
844  typename padPointType , typename SparseGridType,
845  typename outputType>
846  __global__ void link_construct_dw_count(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, outputType out, Point<dim,int> p_dw)
847  {
848  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
849 
850  if (p >= padPoints.size()) {return;}
851 
852  const unsigned int dataBlockPos = padPoints.template get<0>(p);
853  const unsigned int offset = padPoints.template get<1>(p);
854 
855  auto & indexBuffer = grid_cu.getIndexBuffer();
856  auto & dataBuffer = grid_cu.getDataBuffer();
857 
858  auto id = indexBuffer.template get<0>(dataBlockPos);
859  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
860 
861  for (int i = 0 ; i < dim ; i++)
862  {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
863 
864  for (int j = 0 ; j < 2*dim ; j++)
865  {
866  grid_key_dx<dim,int> kc;
867  for (int k = 0 ; k < dim ; k++)
868  {
869  kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
870  }
871 
872  if (grid_dw.template get<pMask>(kc) & 0x1)
873  {
874  int a = atomicAdd(&out.template get<0>(p),1);
875  }
876  }
877  }
878 
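 // For every padding point, count whether the corresponding point of the coarser grid (grid_up)
 // exists (coordinates shifted by p_up and halved).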
883  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
884  typename padPointType , typename SparseGridType,
885  typename outputType>
886  __global__ void link_construct_up_count(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, outputType out, Point<dim,int> p_up)
887  {
888  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
889 
890  if (p >= padPoints.size()) {return;}
891 
892  const unsigned int dataBlockPos = padPoints.template get<0>(p);
893  const unsigned int offset = padPoints.template get<1>(p);
894 
895  auto & indexBuffer = grid_cu.getIndexBuffer();
896  auto & dataBuffer = grid_cu.getDataBuffer();
897 
898  auto id = indexBuffer.template get<0>(dataBlockPos);
899  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
900 
901  for (int i = 0 ; i < dim ; i++)
902  {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
903 
904  if (grid_up.template get<pMask>(pos) & 0x1)
905  {
906  int a = atomicAdd(&out.template get<0>(p),1);
907  }
908  }
909 
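 // Insert the links towards the finer grid: for every padding point, store the chunk position
 // and offset of each existing finer-grid point, at the location given by the scan of the counts.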
914  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
915  typename padPointType , typename SparseGridType, typename scanType, typename outputType>
916  __global__ void link_construct_insert_dw(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_dw)
917  {
918  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
919 
920  if (p >= padPoints.size()) {return;}
921 
922  const unsigned int dataBlockPos = padPoints.template get<0>(p);
923  const unsigned int offset = padPoints.template get<1>(p);
924 
925  auto & indexBuffer = grid_cu.getIndexBuffer();
926  auto & dataBuffer = grid_cu.getDataBuffer();
927 
928  auto & dataBuffer_dw = grid_dw.getDataBuffer();
929 
930  auto id = indexBuffer.template get<0>(dataBlockPos);
931  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
932 
933  for (int i = 0 ; i < dim ; i++)
934  {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
935 
936  unsigned int dataBlockPos_dw;
937  unsigned int offset_dw;
938 
939  int link_offset = scan.template get<0>(p);
940 
941  int c = 0;
942  for (int j = 0 ; j < 2*dim ; j++)
943  {
944  grid_key_dx<dim,int> kc;
945  for (int k = 0 ; k < dim ; k++)
946  {
947  kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
948  }
949 
950  grid_dw.get_sparse(kc,dataBlockPos_dw,offset_dw);
951 
952  if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
953  {
954  out.template get<0>(link_offset + c) = dataBlockPos_dw;
955  out.template get<1>(link_offset + c) = offset_dw;
956 
957  c++;
958  }
959  }
960  }
961 
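 // Insert the link towards the coarser grid: store the chunk position and offset of the
 // corresponding grid_up point, at the location given by the scan of the counts.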
966  template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
967  typename padPointType , typename SparseGridType, typename scanType, typename outputType>
968  __global__ void link_construct_insert_up(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_up)
969  {
970  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
971 
972  if (p >= padPoints.size()) {return;}
973 
974  const unsigned int dataBlockPos = padPoints.template get<0>(p);
975  const unsigned int offset = padPoints.template get<1>(p);
976 
977  auto & indexBuffer = grid_cu.getIndexBuffer();
978  auto & dataBuffer = grid_cu.getDataBuffer();
979 
980  auto & dataBuffer_dw = grid_up.getDataBuffer();
981 
982  auto id = indexBuffer.template get<0>(dataBlockPos);
983  grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
984 
985  for (int i = 0 ; i < dim ; i++)
986  {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
987 
988  unsigned int dataBlockPos_dw;
989  unsigned int offset_dw;
990 
991  int link_offset = scan.template get<0>(p);
992 
993  grid_up.get_sparse(pos,dataBlockPos_dw,offset_dw);
994 
995  if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
996  {
997  out.template get<0>(link_offset) = dataBlockPos_dw;
998  out.template get<1>(link_offset) = offset_dw;
999  }
1000  }
1001 
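 // For every data block, compute the position of each of its nNN neighbouring blocks and store
 // it in nn_blocks (one thread per block/neighbour pair).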
1009  template <unsigned int dim,
1010  typename nNN_type,
1011  typename IndexBufT,
1012  typename SparseGridT,
1013  typename nn_blocksT>
1014  __global__ void findNeighbours(IndexBufT indexBuffer, SparseGridT sparseGrid, nn_blocksT nn_blocks)
1015  {
1016  //todo: #ifdef __NVCC__
1017  constexpr unsigned int pIndex = 0;
1018 
1019  typedef typename IndexBufT::value_type IndexAggregateT;
1020  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1021 
1022  const unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
1023 
1024  const unsigned int dataBlockPos = pos / nNN_type::nNN;
1025  const unsigned int offset = pos % nNN_type::nNN;
1026 
1027  if (dataBlockPos >= indexBuffer.size())
1028  {return;}
1029 
1030  const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1031 
1032  auto neighbourPos = sparseGrid.template getNeighboursPos<nNN_type>(dataBlockId, offset);
1033 
1034  nn_blocks.template get<0>(dataBlockPos*nNN_type::nNN + offset) = neighbourPos;
1035  }
1036 
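 // Apply a stencil functor in place, one data chunk per CUDA block: the mask of a point is
 // cleared when it falls outside the box bx, then the stencil functor is invoked with the same
 // data buffer used for both load and store.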
1037  template <unsigned int dim,
1038  unsigned int pMask,
1039  typename stencil,
1040  typename IndexBufT,
1041  typename DataBufT,
1042  typename SparseGridT,
1043  typename... Args>
1044  __global__ void
1045  applyStencilInPlace(
1046  Box<dim,int> bx,
1047  IndexBufT indexBuffer,
1048  DataBufT dataBuffer,
1049  SparseGridT sparseGrid,
1050  Args... args)
1051  {
1052  constexpr unsigned int pIndex = 0;
1053 
1054  typedef typename IndexBufT::value_type IndexAggregateT;
1055  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1056 
1057  typedef typename DataBufT::value_type AggregateT;
1058  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1059  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
1060  constexpr unsigned int blockSize = MaskBlockT::size;
1061 
1062  // NOTE: here we do one chunk per block! (we want to be sure to fit within local memory
1063  // constraints, since we will also be loading the neighbouring elements - beware the curse of dimensionality...)
1064  const unsigned int dataBlockPos = blockIdx.x;
1065  const unsigned int offset = threadIdx.x;
1066 
1067  if (dataBlockPos >= indexBuffer.size())
1068  {
1069  return;
1070  }
1071 
1072  auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1073 
1074  // todo: Add management of RED-BLACK stencil application! :)
1075  const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1076  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1077 
1078  unsigned char curMask;
1079 
1080  if (offset < blockSize)
1081  {
1082  // Read local mask to register
1083  curMask = dataBlockLoad.template get<pMask>()[offset];
1084  for (int i = 0 ; i < dim ; i++)
1085  {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1086  }
1087 
1088  openfpm::sparse_index<unsigned int> sdataBlockPos;
1089  sdataBlockPos.id = dataBlockPos;
1090 
1091  stencil::stencil(
1092  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1093  curMask, args...);
1094  }
1095 
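 // Same as applyStencilInPlace, but iterating one thread per existing point taken from the
 // point buffer instead of one chunk per CUDA block (no shared-memory ghost block is loaded here).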
1096  template <unsigned int dim,
1097  unsigned int pMask,
1098  typename stencil,
1099  typename IndexBufT,
1100  typename DataBufT,
1101  typename SparseGridT,
1102  typename... Args>
1103  __global__ void
1104  applyStencilInPlaceNoShared(
1105  Box<dim,int> bx,
1106  IndexBufT indexBuffer,
1107  DataBufT dataBuffer,
1108  SparseGridT sparseGrid,
1109  Args... args)
1110  {
1111  constexpr unsigned int pIndex = 0;
1112 
1113  typedef typename IndexBufT::value_type IndexAggregateT;
1114  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1115 
1116  typedef typename DataBufT::value_type AggregateT;
1117  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1118  typedef ScalarTypeOf<AggregateT, pMask> MaskT;
1119  constexpr unsigned int blockSize = MaskBlockT::size;
1120 
1121  int p = blockIdx.x * blockDim.x + threadIdx.x;
1122 
1123  auto & pntBuff = sparseGrid.getPointBuffer();
1124 
1125  if (p >= pntBuff.size())
1126  {
1127  return;
1128  }
1129 
1130  auto id = pntBuff.template get<0>(p);
1131 
1132  const unsigned int dataBlockPos = id / blockSize;
1133  const unsigned int offset = id % blockSize;
1134 
1135  auto dataBlockLoad = dataBuffer.get(dataBlockPos);
1136 
1137  const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1138  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1139 
1140  unsigned char curMask;
1141 
1142  if (offset < blockSize)
1143  {
1144  // Read local mask to register
1145  curMask = dataBlockLoad.template get<pMask>()[offset];
1146  if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
1147  }
1148 
1149  openfpm::sparse_index<unsigned int> sdataBlockPos;
1150  sdataBlockPos.id = dataBlockPos;
1151 
1152  stencil::stencil(
1153  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1154  curMask, args...);
1155  }
1156 
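 // Compact, chunk by chunk, the linearised ids of the existing points (mask bit 0 set) into
 // output, using the per-chunk scan scanBuf to compute the destination position.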
1157  template<unsigned int pMask,
1158  typename dataBuffType,
1159  typename scanType,
1160  typename outType>
1161  __global__ void fill_e_points(dataBuffType dataBuf, scanType scanBuf, outType output)
1162  {
1163  typedef typename dataBuffType::value_type AggregateT;
1164  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1165  constexpr unsigned int blockSize = MaskBlockT::size;
1166 
1167  const unsigned int dataBlockPos = blockIdx.x;
1168  const unsigned int offset = threadIdx.x % blockSize;
1169 
1170  __shared__ int ato_cnt;
1171 
1172  if (threadIdx.x == 0)
1173  {ato_cnt = 0;}
1174 
1175  __syncthreads();
1176 
1177  if (dataBlockPos >= scanBuf.size() - 1)
1178  {
1179  return;
1180  }
1181 
1182  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1183 
1184  int id = atomicAdd(&ato_cnt,predicate);
1185 
1186  __syncthreads();
1187 
1188  if (predicate == true)
1189  {
1190  output.template get<0>(id + scanBuf.template get<0>(dataBlockPos)) = offset + dataBlockPos * blockSize;
1191  }
1192  }
1193 
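 // Count the existing points (mask bit 0 set) of every chunk and store the count in output.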
1194  template<unsigned int pMask,
1195  typename dataBufferType,
1196  typename outType>
1197  __global__ void calc_exist_points(dataBufferType dataBuf, outType output)
1198  {
1199  typedef typename dataBufferType::value_type AggregateT;
1200  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1201  constexpr unsigned int blockSize = MaskBlockT::size;
1202 
1203  const unsigned int dataBlockPos = blockIdx.x;
1204  const unsigned int offset = threadIdx.x % blockSize;
1205 
1206  __shared__ int ato_cnt;
1207 
1208  if (threadIdx.x == 0)
1209  {ato_cnt = 0;}
1210 
1211  __syncthreads();
1212 
1213  if (dataBlockPos >= output.size())
1214  {
1215  return;
1216  }
1217 
1218  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1219 
1220  atomicAdd(&ato_cnt,predicate);
1221 
1222  __syncthreads();
1223 
1224  output.template get<0>(dataBlockPos) = ato_cnt;
1225  }
1226 
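 // Flag with 1 every chunk whose bounding box intersects at least one of the remove boxes.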
1227  template<unsigned int dim,
1228  unsigned int pMask,
1229  unsigned int blockEdgeSize,
1230  typename dataBufferType,
1231  typename outType,
1232  typename boxesVector_type,
1233  typename grid_smb_type,
1234  typename indexBuffer_type>
1235  __global__ void calc_remove_points_chunks_boxes(indexBuffer_type indexBuffer,
1236  boxesVector_type boxes,
1237  grid_smb_type grd,
1238  dataBufferType dataBuf,
1239  outType output)
1240  {
1241  typedef typename dataBufferType::value_type AggregateT;
1242  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1243 
1244  const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1245 
1246  if (dataBlockPos >= indexBuffer.size())
1247  {return;}
1248 
1249  auto id = indexBuffer.template get<0>(dataBlockPos);
1250  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize());
1251 
1253 
1254  for (int i = 0 ; i < dim ; i++)
1255  {
1256  b.setLow(i,pnt.get(i));
1257  b.setHigh(i,pnt.get(i) + blockEdgeSize - 1);
1258  }
1259 
1260  // this block intersect a remove box section so mark the chunk
1261 
1262  output.template get<1>(dataBlockPos) = 0;
1263  for (int k = 0 ; k < boxes.size() ; k++ )
1264  {
1265  Box<dim,unsigned int> btest = boxes.get(k);
1266 
1267  Box<dim,unsigned int> bout;
1268 
1269  if (btest.Intersect(b,bout) == true)
1270  {
1271  output.template get<1>(dataBlockPos) = 1;
1272  }
1273  }
1274  }
1275 
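 // Compact the positions of the flagged chunks (using the scanned flags in act) into output.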
1276  template<typename outType,
1277  typename activeCnkType>
1278  __global__ void collect_rem_chunks(activeCnkType act,
1279  outType output)
1280  {
1281  const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1282 
1283  if (dataBlockPos >= act.size()-1)
1284  {return;}
1285 
1286  auto id = act.template get<1>(dataBlockPos);
1287  auto id_p1 = act.template get<1>(dataBlockPos+1);
1288 
1289  if (id != id_p1)
1290  {
1291  output.template get<0>(id) = dataBlockPos;
1292  }
1293  }
1294 
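 // Clear the mask of every existing point that falls inside one of the remove boxes; only the
 // chunks previously flagged as intersecting (active_blocks) are processed.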
1295  template<unsigned int dim, unsigned int pMask,
1296  typename dataBufferType,
1297  typename indexBufferType,
1298  typename grid_smb_type,
1299  typename activeCntType,
1300  typename boxesType>
1301  __global__ void remove_points(indexBufferType indexBuffer,
1302  grid_smb_type grd,
1303  dataBufferType dataBuffer,
1304  activeCntType active_blocks,
1305  boxesType boxes)
1306  {
1307  typedef typename dataBufferType::value_type AggregateT;
1308  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1309  constexpr unsigned int blockSize = MaskBlockT::size;
1310 
1311  const unsigned int dataBlockPos = active_blocks.template get<0>(blockIdx.x);
1312  const unsigned int offset = threadIdx.x % blockSize;
1313 
1314  if (dataBlockPos >= dataBuffer.size()-1)
1315  {return;}
1316 
1317  int predicate = dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x1;
1318  // calculate point coord;
1319  auto id = indexBuffer.template get<0>(dataBlockPos);
1320  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1321  Point<dim,unsigned int> p;
1322 
1323  for (int i = 0 ; i < dim ; i++)
1324  {p.get(i) = pnt.get(i);}
1325 
1326  // if this block intersect any box
1327 
1328  if (predicate == true)
1329  {
1330  for (int k = 0 ; k < boxes.size() ; k++ )
1331  {
1332  Box<dim,unsigned int> box = boxes.get(k);
1333 
1334  if (box.isInside(p) == true)
1335  {
1336  dataBuffer.template get<pMask>(dataBlockPos)[offset] = 0;
1337  }
1338  }
1339  }
1340  }
1341 
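 // For every chunk and every box, count the existing points that fall inside the box and flag
 // whether the chunk contributes at all; output is strided by stride_size per box.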
1342  template<unsigned int dim,
1343  unsigned int pMask,
1344  unsigned int numCnt,
1345  typename indexT,
1346  typename dataBufferType,
1347  typename outType,
1348  typename boxesVector_type,
1349  typename grid_smb_type,
1350  typename indexBuffer_type>
1351  __global__ void calc_exist_points_with_boxes(indexBuffer_type indexBuffer,
1352  boxesVector_type boxes,
1353  grid_smb_type grd,
1354  dataBufferType dataBuf,
1355  outType output,
1356  unsigned int stride_size)
1357  {
1358  typedef typename dataBufferType::value_type AggregateT;
1359  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1360  constexpr unsigned int blockSize = MaskBlockT::size;
1361 
1362  const unsigned int dataBlockPos = blockIdx.x;
1363  const unsigned int offset = threadIdx.x % blockSize;
1364 
1365  __shared__ int ato_cnt[numCnt];
1366 
1367  if (threadIdx.x < numCnt)
1368  {ato_cnt[threadIdx.x] = 0;}
1369 
1370  __syncthreads();
1371 
1372 #ifdef SE_CLASS1
1373 
1374  if (numCnt > blockDim.x)
1375  {printf("Error calc_exist_points_with_boxes assertion failed numCnt > blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1376 
1377 #endif
1378 
1379  if (dataBlockPos >= output.size())
1380  {return;}
1381 
1382  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1383  // calculate point coord;
1384  indexT id = indexBuffer.template get<0>(dataBlockPos);
1385  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1386  Point<dim,int> p;
1387 
1388  for (int i = 0 ; i < dim ; i++)
1389  {p.get(i) = pnt.get(i);}
1390 
1391  // if this block intersect any box
1392 
1393  if (predicate == true)
1394  {
1395  for (int k = 0 ; k < boxes.size() ; k++ )
1396  {
1397  Box<dim,int> box = boxes.get(k);
1398 
1399  if (box.isInside(p) == true)
1400  {
1401  atomicAdd(&ato_cnt[k],1);
1402  }
1403  }
1404  }
1405 
1406  __syncthreads();
1407 
1408  if (threadIdx.x < boxes.size())
1409  {
1410  output.template get<0>(dataBlockPos+threadIdx.x*stride_size) = ato_cnt[threadIdx.x];
1411  output.template get<1>(dataBlockPos+threadIdx.x*stride_size) = (ato_cnt[threadIdx.x] != 0);
1412  }
1413  }
1414 
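 // For every packing box, emit the linearised id of each existing point that falls inside it,
 // together with its position inside the packed output, using the per-box scans computed before.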
1425  template<unsigned int dim,
1426  unsigned int pMask,
1427  unsigned int numCnt,
1428  typename indexT,
1429  typename dataBufferType,
1430  typename packBufferType,
1431  typename scanType,
1432  typename scanItType,
1433  typename outputType,
1434  typename boxesVector_type,
1435  typename grid_smb_type,
1436  typename indexBuffer_type>
1437  __global__ void get_exist_points_with_boxes(indexBuffer_type indexBuffer,
1438  boxesVector_type boxes,
1439  grid_smb_type grd,
1440  dataBufferType dataBuf,
1441  packBufferType pack_output,
1442  scanType scan,
1443  scanItType scan_it,
1444  outputType output)
1445  {
1446  typedef typename dataBufferType::value_type AggregateT;
1447  typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1448  constexpr unsigned int blockSize = MaskBlockT::size;
1449 
1450  const unsigned int dataBlockPos = blockIdx.x;
1451  const unsigned int offset = threadIdx.x % blockSize;
1452 
1453  __shared__ int ato_cnt[numCnt];
1454 
1455  if (threadIdx.x < numCnt)
1456  {ato_cnt[threadIdx.x] = 0;}
1457 
1458  __syncthreads();
1459 
1460 #ifdef SE_CLASS1
1461 
1462  if (numCnt > blockDim.x)
1463  {printf("Error get_exist_points_with_boxes assertion failed numCnt > blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1464 
1465 #endif
1466 
1467  int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1468  // calculate point coord;
1469  indexT id = indexBuffer.template get<0>(dataBlockPos);
1470  grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1471  Point<dim,int> p_;
1472 
1473  for (int i = 0 ; i < dim ; i++)
1474  {p_.get(i) = pnt.get(i);}
1475 
1476  // if this block intersect any box
1477 
1478  if (predicate == true)
1479  {
1480  for (int k = 0 ; k < boxes.size() ; k++ )
1481  {
1482  Box<dim,int> box = boxes.get(k);
1483 
1484  if (box.isInside(p_) == true)
1485  {
1486  // We have an atomic counter for every packing box
1487  int p = atomicAdd(&ato_cnt[k] , 1);
1488 
1489  // we have a scan for every box
1490  const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuffer.size() + 1));
1491  unsigned int sit = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1));
1492  int scan_id = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1)) + scan_it.template get<0>(k);
1493  output.template get<0>(scan_id + p) = (offset + dataBlockPos * blockSize) * numCnt + k;
1494  pack_output.template get<0>(scan_id + p) = p + sit;
1495  }
1496  }
1497  }
1498  }
1499 
1500 
1501 
1502 
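 // Convert packed chunk ids into chunk ids of the destination grid: for every shift, the chunk
 // origin is translated from the packing geometry to the destination geometry (clamping at the
 // borders) and re-linearised into output.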
1503  template<unsigned int dim,
1504  unsigned int blockSize,
1505  unsigned int blockEdgeSize,
1506  unsigned int nshifts,
1507  typename indexT,
1508  typename linearizer,
1509  typename shiftTypeVector,
1510  typename outputType>
1511  __global__ void convert_chunk_ids(indexT * ids,
1512  int n_cnk,
1513  linearizer gridGeoPack,
1514  grid_key_dx<dim,int> origPack,
1515  linearizer gridGeo,
1516  grid_key_dx<dim,int> origUnpack,
1517  outputType output,
1518  shiftTypeVector shifts,
1519  grid_key_dx<dim,int> sz,
1520  int bs)
1521  {
1522  // points
1523  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1524 
1525  if (p >= n_cnk)
1526  {return;}
1527 
1528  auto id = ids[p];
1529 
1530  for (int i = 0 ; i < nshifts ; i++)
1531  {
1532  grid_key_dx<dim,int> pos = gridGeoPack.InvLinId(id,0) - origPack + origUnpack;
1533 
1534  for (int j = 0 ; j < dim ; j++)
1535  {
1536  pos.set_d(j,pos.get(j) + shifts.template get<0>(i)[j]*blockEdgeSize);
1537  if (pos.get(j) < 0)
1538  {
1539  pos.set_d(j,0);
1540  }
1541  if (pos.get(j) >= sz.get(j))
1542  {
1543  pos.set_d(j,pos.get(j) - blockEdgeSize);
1544  }
1545  }
1546 
1547  auto plin = gridGeo.LinId(pos);
1548 
1549  output.template get<0>(p*nshifts + i + bs) = plin / blockSize;
1550  }
1551  }
1552 
1553  template<typename vectorType>
1554  __global__ void set_one(vectorType vt)
1555  {
1556  // points
1557  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1558 
1559  if (p >= vt.size())
1560  {return;}
1561 
1562  vt.template get<0>(p) = 1;
1563  }
1564 
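 // For every merged segment, propagate the output chunk position (outMap) to all the data-map
 // entries belonging to that segment, building the new chunk map.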
1565  template<unsigned int pSegment,typename newMapType, typename mergeMapType,
1566  typename dataMapType, typename segmentOffsetType,
1567  typename outMapType>
1568  __global__ void construct_new_chunk_map(newMapType new_map, dataMapType dataMap,
1569  mergeMapType merge_id, outMapType outMap,
1570  segmentOffsetType segments_data, int start_p)
1571  {
1572  // points
1573  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1574 
1575  if (p >= segments_data.size()-1)
1576  {return;}
1577 
1578  unsigned int st = segments_data.template get<pSegment>(p);
1579 
1580  int segmentSize = segments_data.template get<pSegment>(p + 1)
1581  - segments_data.template get<pSegment>(p);
1582 
1583  for (int j = 0 ; j < segmentSize ; j++)
1584  {
1585  int dm = dataMap.template get<0>(st+j);
1586  new_map.template get<0>(dm) = outMap.template get<0>(p);
1587  }
1588  }
1589 
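 // Copy packed point data into the destination chunks: the packed offset is converted through
 // blc into a local offset plus a shift index, the new chunk map gives the destination chunk,
 // the selected properties are unpacked and the existence bit of the mask is set.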
1590  template<unsigned int pMask, typename AggregateT, typename blockConvertType, typename newMapType, typename dataType_ptrs, typename dataType, unsigned int ... prp>
1591  __global__ void copy_packed_data_to_chunks(unsigned int * scan,
1592  unsigned short int * offsets,
1593  blockConvertType blc,
1594  newMapType new_map,
1595  dataType_ptrs data_ptrs,
1596  dataType data_buff,
1597  unsigned int n_cnk,
1598  unsigned int n_shf,
1599  unsigned int n_pnt,
1600  unsigned int i,
1601  unsigned int n_accu_cnk)
1602  {
1603  // points
1604  const unsigned int p = blockIdx.x;
1605 
1606  if (p >= n_cnk)
1607  {return;}
1608 
1609  int scan_pp = scan[p];
1610  int n_block_pnt = scan[p+1] - scan_pp;
1611 
1612  if (threadIdx.x < n_block_pnt)
1613  {
1614  unsigned short int off = offsets[scan[p] + threadIdx.x];
1615 
1616  int conv = blc.template get<0>(i)[off];
1617 
1618  unsigned short int off_c = conv & 0xFFFF;
1619  unsigned short int shf_c = conv >> 16;
1620 
1621  unsigned int pos_c = new_map.template get<0>(n_shf*p + shf_c + n_accu_cnk);
1622 
1623  sparsegridgpu_unpack_impl<AggregateT, dataType ,prp ...>
1624  spi(pos_c,off_c,data_buff,scan_pp + threadIdx.x,data_ptrs,n_pnt);
1625 
1626  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1627 
1628  data_buff.template get<pMask>(pos_c)[off_c] |= 0x1;
1629  }
1630  }
1631 
1632 
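 // Write the final value of each per-pack scan (the total number of packed points) at the end
 // of the corresponding destination scan buffer.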
1633  template<typename scanPointerType, typename scanType>
1634  __global__ void last_scan_point(scanPointerType scan_ptr, scanType scan,unsigned int stride, unsigned int n_pack)
1635  {
1636  const unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
1637 
1638  if (k >= n_pack) {return;}
1639 
1640  unsigned int ppos = scan.template get<0>((k+1)*stride-1);
1641  unsigned int pos = scan.template get<1>((k+1)*stride-1);
1642 
1643  ((unsigned int *)scan_ptr.ptr[k])[pos] = ppos;
1644  }
1645 
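 // Pack the selected points: for every entry of the point buffer, copy the selected properties
 // into the per-box destination buffers (data_ptr) and write the chunk index, scan value,
 // in-chunk offset and mask byte into the corresponding output arrays.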
1646  template<unsigned int pMask,
1647  typename AggregateT,
1648  unsigned int n_it,
1649  unsigned int n_prp,
1650  typename indexT,
1651  typename pntBuff_type,
1652  typename pointOffset_type,
1653  typename indexBuffer_type,
1654  typename dataBuffer_type,
1655  typename scan_type,
1656  unsigned int blockSize,
1657  unsigned int ... prp>
1658  __global__ void pack_data(pntBuff_type pntBuff,
1659  dataBuffer_type dataBuff,
1660  indexBuffer_type indexBuff,
1661  scan_type scan,
1662  pointOffset_type point_offsets,
1663  arr_ptr<n_it> index_ptr,
1664  arr_ptr<n_it> scan_ptr,
1665  arr_arr_ptr<n_it,n_prp> * data_ptr,
1666  arr_ptr<n_it> offset_ptr,
1667  arr_ptr<n_it> mask_ptr,
1668  static_array<n_it,unsigned int> sar)
1669  {
1670  const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1671 
1672  if (p >= pntBuff.size())
1673  {return;}
1674 
1675  const unsigned int pb = pntBuff.template get<0>(p);
1676  const unsigned int p_offset = point_offsets.template get<0>(p);
1677 
1678  const unsigned int k = pb % n_it;
1679  const unsigned int id = pb / n_it;
1680 
1681  const unsigned int dataBlockPos = id / blockSize;
1682  const unsigned int offset = id % blockSize;
1683 
1684  unsigned int ppos = scan.template get<0>(dataBlockPos + k*(indexBuff.size() + 1));
1685  const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuff.size() + 1));
1686 
1687  sparsegridgpu_pack_impl<AggregateT, dataBuffer_type ,prp ...>
1688  spi(dataBlockPos,offset,dataBuff,p_offset,data_ptr->ptr[k],sar.sa[k]);
1689 
1690  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1691 
1692  ((unsigned int *)scan_ptr.ptr[k])[dataBlockPosPack] = ppos;
1693 
1694  ((indexT *)index_ptr.ptr[k])[dataBlockPosPack] = indexBuff.template get<0>(dataBlockPos);
1695  ((short int *)offset_ptr.ptr[k])[p_offset] = offset;
1696  ((unsigned char *)mask_ptr.ptr[k])[p_offset] = dataBuff.template get<pMask>(dataBlockPos)[offset];
1697  }
1698 }
1699 
1700 #endif //OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH