OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
SparseGridGpu_kernels.cuh
1//
2// Created by tommaso on 19/06/19.
3//
4
5#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
6#define OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH
7
8#include <SparseGridGpu/BlockMapGpu.hpp>
9#include <SparseGridGpu/TemplateUtils/mathUtils.hpp>
10#include "util/cuda_util.hpp"
11#include "SparseGrid/cp_block.hpp"
12
13#ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
14#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE
15#endif
16
17#ifndef SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
18#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED
19#endif
20
21enum mask_sparse
22{
23 NOT_EXIST = 0,
24 EXIST = 1,
25 PADDING = 2,
26 EXIST_AND_PADDING = 3
27};
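// Illustration (added note, mirrors the checks used by the kernels below): the values above
// are bit flags, so EXIST_AND_PADDING == (EXIST | PADDING). A point mask is therefore tested
// with bitwise AND, e.g.
//
//     unsigned char curMask = dataBlockLoad.template get<pMask>()[offset];
//     bool isActive = (curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING);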
28
29// Kernels for SparseGridGpu
30namespace SparseGridGpuKernels
31{
32 template<typename SparseGridGpuType, typename pointers_type, typename headers_type>
33 __global__ void unpack_headers(pointers_type pointers, headers_type headers, int * result, unsigned int sz_pack, int n_slot)
34 {
35 int t = threadIdx.x;
36
37 if (t >= pointers.size()) {return;}
38
39 unsigned char * data_pack = (unsigned char *)pointers.template get<0>(t);
40
41 while (data_pack < pointers.template get<1>(t) )
42 {
43 int ih = pointers.template get<2>(t);
44 if (n_slot > ih)
45 {
46 if (sizeof(typename SparseGridGpuType::indexT_) == 8)
47 {headers.template get<0>(t*n_slot + ih) = *(size_t *)data_pack;}
48 else
49 {
50 unsigned int dp1 = *(unsigned int *)data_pack;
51 unsigned int dp2 = *(unsigned int *)&(data_pack[4]);
52 headers.template get<0>(t*n_slot + ih) = (size_t)dp1 + (((size_t)dp2) << 32);
53 }
54 data_pack += sizeof(size_t);
55 data_pack += SparseGridGpuType::unpack_headers(headers,data_pack,t*n_slot + ih,sz_pack);
56 pointers.template get<2>(t) += 1;
57 }
58 else
59 {
60 // report error
61 result[0] = 1;
62 return;
63 }
64 }
65 }
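// Sketch of the index reassembly above (illustrative only): when indexT_ is 4 bytes wide the
// packed 8-byte block index is rebuilt from two 32-bit reads, equivalent to
//
//     unsigned int lo = *(unsigned int *)data_pack;        // low 4 bytes of the header
//     unsigned int hi = *(unsigned int *)&data_pack[4];    // high 4 bytes
//     size_t idx = (size_t)lo + (((size_t)hi) << 32);
//
// presumably to avoid a misaligned 64-bit load from the byte-oriented pack buffer.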
66
67
68 template<unsigned int dim>
69 struct stencil_cross_func_impl
70 {
71 template<typename ScalarT, typename coordType, typename SparseGridT, unsigned int enlargedBlockSize, typename lambda_func, typename ... ArgsT>
72 __device__ static inline void stencil(ScalarT & res, ScalarT & cur, coordType & coord ,
73 ScalarT (& enlargedBlock)[enlargedBlockSize],
74 lambda_func f,
75 SparseGridT & sparseGrid, ArgsT ... args)
76 {
78
79 for (int d = 0; d < dim; ++d)
80 {
81 auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
82 auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
83 ScalarT neighbourPlus = enlargedBlock[nPlusId];
84 ScalarT neighbourMinus = enlargedBlock[nMinusId];
85
86 cs.xm[d] = neighbourMinus;
87 cs.xp[d] = neighbourPlus;
88 }
89
90 res = f(cur,cs, args ...);
91 }
92 };
93
94 template<unsigned int dim>
95 struct stencil_conv_func_impl
96 {
97 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
98 __device__ static inline void stencil(ScalarT & res, coordType & coord ,
99 CpBlockType & cpb,
100 lambda_func f,
101 ArgsT ... args)
102 {
103 printf("Convolution operation on GPU: Dimension not implemented \n");
104 }
105
106 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
107 __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
108 CpBlockType & cpb1,
109 CpBlockType & cpb2,
110 lambda_func f,
111 ArgsT ... args)
112 {
113 printf("Convolution operation on GPU: Dimension not implemented \n");
114 }
115 };
116
117 template<>
118 struct stencil_conv_func_impl<3>
119 {
120 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
121 __device__ static inline void stencil_block(ScalarT & res, coordType & coord ,
122 CpBlockType & cpb,
123 DataBlockWrapperT & DataBlockLoad,
124 int offset,
125 lambda_func f,
126 ArgsT ... args)
127 {
128 res = f(cpb,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
129 }
130
131 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
132 __device__ static inline void stencil(ScalarT & res, coordType & coord ,
133 CpBlockType & cpb,
134 lambda_func f,
135 ArgsT ... args)
136 {
137 res = f(cpb,coord[0],coord[1],coord[2]);
138 }
139
140 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
141 __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
142 CpBlockType & cpb1,
143 CpBlockType & cpb2,
144 lambda_func f,
145 ArgsT ... args)
146 {
147 f(res1,res2,cpb1,cpb2,coord[0],coord[1],coord[2]);
148 }
149
150 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
151 __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
152 CpBlockType & cpb1,
153 CpBlockType & cpb2,
154 DataBlockWrapperT & DataBlockLoad,
155 int offset,
156 lambda_func f,
157 ArgsT ... args)
158 {
159 f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
160 }
161
162 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
163 __device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
164 CpBlockType & cpb1,
165 CpBlockType & cpb2,
166 CpBlockType & cpb3,
167 DataBlockWrapperT & DataBlockLoad,
168 int offset,
169 lambda_func f,
170 ArgsT ... args)
171 {
172 f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
173 }
174 };
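// A lambda passed to the 3D stencil() / stencil2() entry points above receives the enlarged-block
// accessor(s) plus the local x,y,z coordinates. A minimal sketch, assuming the CpBlockType accessor
// exposes element access as u(x,y,z) (names and access syntax are illustrative, not taken from this
// header):
//
//     auto laplacian = [] __device__ (CpBlockType & u, int i, int j, int k) -> float
//     {
//         return u(i-1,j,k) + u(i+1,j,k) + u(i,j-1,k) + u(i,j+1,k)
//              + u(i,j,k-1) + u(i,j,k+1) - 6.0f*u(i,j,k);
//     };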
175
176 template<>
177 struct stencil_conv_func_impl<2>
178 {
179 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
180 __device__ static inline void stencil_block(ScalarT & res, coordType & coord,
181 CpBlockType & cpb,
182 DataBlockWrapperT & DataBlockLoad,
183 int offset,
184 lambda_func f,
185 ArgsT ... args)
186 {
187 res = f(cpb,DataBlockLoad,offset,coord[0],coord[1]);
188 }
189
190 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
191 __device__ static inline void stencil(ScalarT & res, coordType & coord ,
192 CpBlockType & cpb,
193 lambda_func f,
194 ArgsT ... args)
195 {
196 res = f(cpb,coord[0],coord[1]);
197 }
198
199 template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
200 __device__ static inline void stencil2(ScalarT & res1, ScalarT & res2, coordType & coord ,
201 CpBlockType & cpb1,
202 CpBlockType & cpb2,
203 lambda_func f,
204 ArgsT ... args)
205 {
206 f(res1,res2,cpb1,cpb2,coord[0],coord[1]);
207 }
208
209 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
210 __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
211 CpBlockType & cpb1,
212 CpBlockType & cpb2,
213 DataBlockWrapperT & DataBlockLoad,
214 int offset,
215 lambda_func f,
216 ArgsT ... args)
217 {
218 f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
219 }
220
221 template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
222 __device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
223 CpBlockType & cpb1,
224 CpBlockType & cpb2,
225 CpBlockType & cpb3,
226 DataBlockWrapperT & DataBlockLoad,
227 int offset,
228 lambda_func f,
229 ArgsT ... args)
230 {
231 f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1]);
232 }
233 };
234
235 template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
237 {
239
240 static constexpr unsigned int supportRadius = stencil_size;
241
242 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
243 static inline __device__ void stencil(
244 SparseGridT & sparseGrid,
245 const unsigned int dataBlockId,
247 unsigned int offset,
248 grid_key_dx<dim, int> & pointCoord,
249 DataBlockWrapperT & dataBlockLoad,
250 DataBlockWrapperT & dataBlockStore,
251 unsigned char curMask,
252 lambda_func f,
253 ArgT ... args)
254 {
255 typedef typename SparseGridT::AggregateBlockType AggregateT;
256 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
257
258 constexpr unsigned int enlargedBlockSize = IntPow<
259 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
260
261 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
262
263 for (int i = 0; i < n_loop ; i++)
264 {
265 if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
266 {
267 enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
268 }
269 }
270
271 __syncthreads();
272
274 typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
275
277
278 sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
279
280 __syncthreads();
281
282 ScalarT res = 0;
283
284 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
285 {
286 int coord[dim];
287
288 unsigned int linIdTmp = offset;
289 for (unsigned int d = 0; d < dim; ++d)
290 {
291 coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
292 linIdTmp /= SparseGridT::blockEdgeSize_;
293 }
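// Worked example of the decomposition above (illustrative): with blockEdgeSize_ == 8 and dim == 3,
// the in-chunk offset 69 decodes to coord = {5, 0, 1}, since 69 = 5 + 0*8 + 1*64.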
294
295 stencil_conv_func_impl<dim>::stencil(res,coord,cpb,f,args...);
296
297 dataBlockStore.template get<p_dst>()[offset] = res;
298 }
299 }
300
301 template <typename SparseGridT, typename CtxT>
302 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
303 {
304 // No flush
305 }
306 };
307
308
309 template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
311 {
313
314 static constexpr unsigned int supportRadius = stencil_size;
315
316 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
317 static inline __device__ void stencil(
318 SparseGridT & sparseGrid,
319 const unsigned int dataBlockId,
321 unsigned int offset,
322 grid_key_dx<dim, int> & pointCoord,
323 DataBlockWrapperT & dataBlockLoad,
324 DataBlockWrapperT & dataBlockStore,
325 unsigned char curMask,
326 lambda_func f,
327 ArgT ... args)
328 {
329 typedef typename SparseGridT::AggregateBlockType AggregateT;
330 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
331
332 constexpr unsigned int enlargedBlockSize = IntPow<
333 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
334
335 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
336
337 for (int i = 0; i < n_loop ; i++)
338 {
339 if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
340 {
341 enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
342 }
343 }
344
345 __syncthreads();
346
348 typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
349
351
352 sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
353
354 __syncthreads();
355
356 ScalarT res = 0;
357
358 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
359 {
360 int coord[dim];
361
362 unsigned int linIdTmp = offset;
363 for (unsigned int d = 0; d < dim; ++d)
364 {
365 coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
366 linIdTmp /= SparseGridT::blockEdgeSize_;
367 }
368
369 stencil_conv_func_impl<dim>::stencil_block(res,coord,cpb,dataBlockLoad,offset,f,args...);
370
371 dataBlockStore.template get<p_dst>()[offset] = res;
372 }
373 }
374
375 template <typename SparseGridT, typename CtxT>
376 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
377 {
378 // No flush
379 }
380 };
381
382 template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
384 {
386
387 static constexpr unsigned int supportRadius = stencil_size;
388
389 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
390 static inline __device__ void stencil(
391 SparseGridT & sparseGrid,
392 const unsigned int dataBlockId,
394 unsigned int offset,
395 grid_key_dx<dim, int> & pointCoord,
396 DataBlockWrapperT & dataBlockLoad,
397 DataBlockWrapperT & dataBlockStore,
398 unsigned char curMask,
399 lambda_func f,
400 ArgT ... args)
401 {
402 typedef typename SparseGridT::AggregateBlockType AggregateT;
403 typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
404 typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
405
406 constexpr unsigned int enlargedBlockSize = IntPow<
407 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
408
409 __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
410 __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
411
412 // fill with background
413
415 typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
416
419
420 sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
421 sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
422
423 __syncthreads();
424
425 ScalarT1 res1 = 0;
426 ScalarT2 res2 = 0;
427
428 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
429 {
430 int coord[dim];
431
432 unsigned int linIdTmp = offset;
433 for (unsigned int d = 0; d < dim; ++d)
434 {
435 coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
436 linIdTmp /= SparseGridT::blockEdgeSize_;
437 }
438
439 stencil_conv_func_impl<dim>::stencil2_block(res1,res2,coord,cpb1,cpb2,dataBlockLoad,offset,f,args...);
440
441 dataBlockStore.template get<p_dst1>()[offset] = res1;
442 dataBlockStore.template get<p_dst2>()[offset] = res2;
443 }
444 }
445
446 template <typename SparseGridT, typename CtxT>
447 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
448 {
449 // No flush
450 }
451 };
452
453 template<unsigned int dim, unsigned int n_loop,
454 unsigned int p_src1, unsigned int p_src2, unsigned int p_src3,
455 unsigned int p_dst1, unsigned int p_dst2, unsigned int p_dst3,
456 unsigned int stencil_size>
458 {
460
461 static constexpr unsigned int supportRadius = stencil_size;
462
463 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
464 static inline __device__ void stencil(
465 SparseGridT & sparseGrid,
466 const unsigned int dataBlockId,
468 unsigned int offset,
469 grid_key_dx<dim, int> & pointCoord,
470 DataBlockWrapperT & dataBlockLoad,
471 DataBlockWrapperT & dataBlockStore,
472 unsigned char curMask,
473 lambda_func f,
474 ArgT ... args)
475 {
476 typedef typename SparseGridT::AggregateBlockType AggregateT;
477 typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
478 typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
479 typedef ScalarTypeOf<AggregateT, p_src3> ScalarT3;
480
481 constexpr unsigned int enlargedBlockSize = IntPow<
482 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
483
484 __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
485 __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
486 __shared__ ScalarT3 enlargedBlock3[enlargedBlockSize];
487
488 // fill with background
489
491 typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
492
496
497 sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
498 sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
499 sparseGrid.template loadGhostBlock<p_src3>(dataBlockLoad, dataBlockIdPos, enlargedBlock3);
500
501 __syncthreads();
502
503 ScalarT1 res1 = 0;
504 ScalarT2 res2 = 0;
505 ScalarT3 res3 = 0;
506
507 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
508 {
509 int coord[dim];
510
511 unsigned int linIdTmp = offset;
512 for (unsigned int d = 0; d < dim; ++d)
513 {
514 coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
515 linIdTmp /= SparseGridT::blockEdgeSize_;
516 }
517
518 stencil_conv_func_impl<dim>::stencil3_block(res1,res2,res3,coord,cpb1,cpb2,cpb3,dataBlockLoad,offset,f,args...);
519
520 dataBlockStore.template get<p_dst1>()[offset] = res1;
521 dataBlockStore.template get<p_dst2>()[offset] = res2;
522 dataBlockStore.template get<p_dst3>()[offset] = res3;
523 }
524 }
525
526 template <typename SparseGridT, typename CtxT>
527 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
528 {
529 // No flush
530 }
531 };
532
533 template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
535 {
537
538 static constexpr unsigned int supportRadius = stencil_size;
539
540 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
541 static inline __device__ void stencil(
542 SparseGridT & sparseGrid,
543 const unsigned int dataBlockId,
545 unsigned int offset,
546 grid_key_dx<dim, int> & pointCoord,
547 DataBlockWrapperT & dataBlockLoad,
548 DataBlockWrapperT & dataBlockStore,
549 unsigned char curMask,
550 lambda_func f,
551 ArgT ... args)
552 {
553 typedef typename SparseGridT::AggregateBlockType AggregateT;
554 typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
555 typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
556
557 constexpr unsigned int enlargedBlockSize = IntPow<
558 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
559
560 __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
561 __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
562
563 // fill with background
564
566 typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
567
570
571 sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
572 sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
573
574 __syncthreads();
575
576 ScalarT1 res1 = 0;
577 ScalarT2 res2 = 0;
578
579 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
580 {
581 int coord[dim];
582
583 unsigned int linIdTmp = offset;
584 for (unsigned int d = 0; d < dim; ++d)
585 {
586 coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
587 linIdTmp /= SparseGridT::blockEdgeSize_;
588 }
589
590 stencil_conv_func_impl<dim>::stencil2(res1,res2,coord,cpb1,cpb2,f,args...);
591
592 dataBlockStore.template get<p_dst1>()[offset] = res1;
593 dataBlockStore.template get<p_dst2>()[offset] = res2;
594 }
595 }
596
597 template <typename SparseGridT, typename CtxT>
598 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
599 {
600 // No flush
601 }
602 };
603
604 template<unsigned int dim, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
606 {
608
609 static constexpr unsigned int supportRadius = stencil_size;
610
611 template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
612 static inline __device__ void stencil(
613 SparseGridT & sparseGrid,
614 const unsigned int dataBlockId,
616 unsigned int offset,
617 grid_key_dx<dim, int> & pointCoord,
618 DataBlockWrapperT & dataBlockLoad,
619 DataBlockWrapperT & dataBlockStore,
620 unsigned char curMask,
621 lambda_func f,
622 ArgT ... args)
623 {
624 typedef typename SparseGridT::AggregateBlockType AggregateT;
625 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
626
627 constexpr unsigned int enlargedBlockSize = IntPow<
628 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
629
630 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
631
632 sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
633
634 __syncthreads();
635
636 decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
637 ScalarT res = 0;
638
639 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
640 {
641 const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
642 // const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
643 linId = sparseGrid.getLinIdInEnlargedBlock(offset);
644 ScalarT cur = enlargedBlock[linId];
645
646 stencil_cross_func_impl<dim>::stencil(res,cur,coord,enlargedBlock,f,sparseGrid,args ...);
647
648
649 }
650 __syncthreads();
651 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
652 {
653 enlargedBlock[linId] = res;
654 }
655 __syncthreads();
656 sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
657 }
658
659 template <typename SparseGridT, typename CtxT>
660 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
661 {
662 // No flush
663 }
664 };
665
666 // This kernel is to be called with 1D parameters (?)
667 template <unsigned int dim,
668 unsigned int stencilSupportRadius,
669 unsigned int pMask,
670 typename NN_type,
671 typename checker_type,
672 typename IndexBufT,
673 typename DataBufT,
674 typename SparseGridT,
675 typename nn_blocksT>
676 __global__ void tagBoundaries(IndexBufT indexBuffer, DataBufT dataBuffer, SparseGridT sparseGrid,nn_blocksT nbT, checker_type chk)
677 {
678 //todo: #ifdef __NVCC__
679 constexpr unsigned int pIndex = 0;
680
681 typedef typename IndexBufT::value_type IndexAggregateT;
682 typedef BlockTypeOf<IndexAggregateT, pIndex> IndexT;
683
684 typedef typename DataBufT::value_type AggregateT;
685 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
686 typedef ScalarTypeOf<AggregateT, pMask> MaskT;
687 constexpr unsigned int blockSize = MaskBlockT::size;
688
689 // NOTE: here we process 1 chunk per block! (we want to be sure to fit local memory constraints,
690 // since we will also be loading neighbouring elements!) (beware the curse of dimensionality...)
691 const unsigned int dataBlockPos = blockIdx.x;
692 const unsigned int offset = threadIdx.x;
693
694 constexpr unsigned int enlargedBlockSize = IntPow<
695 sparseGrid.getBlockEdgeSize() + 2 * stencilSupportRadius, dim>::value;
696 __shared__ MaskT enlargedBlock[enlargedBlockSize];
697
698 if (dataBlockPos >= indexBuffer.size())
699 {
700 return;
701 }
702
703 const long long dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
704 auto dataBlock = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
705
707 sdataBlockPos.id = dataBlockPos;
708 sparseGrid.template loadGhostBlock<pMask>(dataBlock,sdataBlockPos,enlargedBlock);
709
710 __syncthreads();
711
712 bool check = chk.check(sparseGrid,dataBlockId,offset);
713
714 // Here the boundary points are tagged
715 if (offset < blockSize && check == true)
716 {
717 const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
718 const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
719
720 MaskT cur = enlargedBlock[linId];
721 if (sparseGrid.exist(cur))
722 {
723 bool isPadding = NN_type::isPadding(sparseGrid,coord,enlargedBlock);
724 if (isPadding)
725 {
726 sparseGrid.setPadding(enlargedBlock[linId]);
727 }
728 else
729 {
730 sparseGrid.unsetPadding(enlargedBlock[linId]);
731 }
732 }
733 }
734 // Write block back to global memory
735 __syncthreads();
736 sparseGrid.template storeBlock<pMask>(dataBlock, enlargedBlock);
737 }
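// After tagBoundaries has run, a point keeps its EXIST bit and additionally carries the PADDING bit
// whenever NN_type::isPadding reports it as a boundary point. Later kernels test the bit directly, e.g.
//
//     bool isPadding = dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x2;   // mask_sparse::PADDING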
738
743 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType>
744 __global__ void link_construct(SparseGridType grid_up, SparseGridType grid_cu, outputType out)
745 {
746 const unsigned int dataBlockPos = blockIdx.x;
747 const unsigned int offset = threadIdx.x;
748
749 auto & indexBuffer = grid_cu.getIndexBuffer();
750 auto & dataBuffer = grid_cu.getDataBuffer();
751
752 // if the point is a padding
753 if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
754 {
755 auto id = indexBuffer.template get<0>(dataBlockPos);
756 grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
757
758 printf("HERE %d %d \n",pos.get(0),pos.get(1));
759
760 for (int i = 0 ; i < dim ; i++)
761 {pos.set_d(i,pos.get(i) / 2);}
762
763 if (grid_up.template get<pMask>(pos) == 0x1)
764 {
765 atomicAdd(&out.template get<0>(dataBlockPos),1);
766 }
767 }
768 }
769
774 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size , typename SparseGridType, typename outputType, typename BoxType>
775 __global__ void count_paddings(SparseGridType grid_cu, outputType out, BoxType box)
776 {
777 const unsigned int dataBlockPos = blockIdx.x;
778 const unsigned int offset = threadIdx.x;
779
780 auto & indexBuffer = grid_cu.getIndexBuffer();
781 auto & dataBuffer = grid_cu.getDataBuffer();
782
783 auto id = indexBuffer.template get<0>(dataBlockPos);
784
785 // check if the point is inside the box
786 auto coord = grid_cu.getCoord(id,offset);
787
788 bool active = box.isInsideKey(coord);
789
790 if (active == false)
791 {return;}
792
793 // if the point is a padding
794 if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
795 {
796 atomicAdd(&out.template get<0>(dataBlockPos),1);
797 }
798 }
799
804 template<unsigned int pMask, typename SparseGridType, typename ScanType, typename outputType, typename BoxType>
805 __global__ void collect_paddings(SparseGridType grid_cu, ScanType stp, outputType out, BoxType box)
806 {
807 const unsigned int dataBlockPos = blockIdx.x;
808 const unsigned int offset = threadIdx.x;
809
810 __shared__ int counter;
811 counter = 0;
812 __syncthreads();
813
814 auto & indexBuffer = grid_cu.getIndexBuffer();
815 auto & dataBuffer = grid_cu.getDataBuffer();
816
817 auto id = indexBuffer.template get<0>(dataBlockPos);
818
819 // check if the point is inside the box
820 auto coord = grid_cu.getCoord(id,offset);
821
822 bool active = box.isInsideKey(coord);
823
824 if (active == false)
825 {return;}
826
827 int pad_offset = stp.template get<0>(dataBlockPos);
828
829 // if the point is a padding
830 if (dataBuffer.template get <pMask>(dataBlockPos)[offset] & 0x2)
831 {
832 int cnt = atomicAdd(&counter,1);
833
834 out.template get<0>(pad_offset + cnt) = dataBlockPos;
835 out.template get<1>(pad_offset + cnt) = offset;
836 }
837 }
838
843 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
844 typename padPointType , typename SparseGridType,
845 typename outputType>
846 __global__ void link_construct_dw_count(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, outputType out, Point<dim,int> p_dw)
847 {
848 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
849
850 if (p >= padPoints.size()) {return;}
851
852 const unsigned int dataBlockPos = padPoints.template get<0>(p);
853 const unsigned int offset = padPoints.template get<1>(p);
854
855 auto & indexBuffer = grid_cu.getIndexBuffer();
856 auto & dataBuffer = grid_cu.getDataBuffer();
857
858 auto id = indexBuffer.template get<0>(dataBlockPos);
859 grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
860
861 for (int i = 0 ; i < dim ; i++)
862 {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
863
864 for (int j = 0 ; j < 2*dim ; j++)
865 {
867 for (int k = 0 ; k < dim ; k++)
868 {
869 kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
870 }
871
872 if (grid_dw.template get<pMask>(kc) & 0x1)
873 {
874 int a = atomicAdd(&out.template get<0>(p),1);
875 }
876 }
877 }
878
883 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
884 typename padPointType , typename SparseGridType,
885 typename outputType>
886 __global__ void link_construct_up_count(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, outputType out, Point<dim,int> p_up)
887 {
888 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
889
890 if (p >= padPoints.size()) {return;}
891
892 const unsigned int dataBlockPos = padPoints.template get<0>(p);
893 const unsigned int offset = padPoints.template get<1>(p);
894
895 auto & indexBuffer = grid_cu.getIndexBuffer();
896 auto & dataBuffer = grid_cu.getDataBuffer();
897
898 auto id = indexBuffer.template get<0>(dataBlockPos);
899 grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
900
901 for (int i = 0 ; i < dim ; i++)
902 {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
903
904 if (grid_up.template get<pMask>(pos) & 0x1)
905 {
906 int a = atomicAdd(&out.template get<0>(p),1);
907 }
908 }
909
914 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
915 typename padPointType , typename SparseGridType, typename scanType, typename outputType>
916 __global__ void link_construct_insert_dw(padPointType padPoints, SparseGridType grid_dw, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_dw)
917 {
918 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
919
920 if (p >= padPoints.size()) {return;}
921
922 const unsigned int dataBlockPos = padPoints.template get<0>(p);
923 const unsigned int offset = padPoints.template get<1>(p);
924
925 auto & indexBuffer = grid_cu.getIndexBuffer();
926 auto & dataBuffer = grid_cu.getDataBuffer();
927
928 auto & dataBuffer_dw = grid_dw.getDataBuffer();
929
930 auto id = indexBuffer.template get<0>(dataBlockPos);
931 grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
932
933 for (int i = 0 ; i < dim ; i++)
934 {pos.set_d(i,pos.get(i) * 2 + p_dw.get(i) );}
935
936 unsigned int dataBlockPos_dw;
937 unsigned int offset_dw;
938
939 int link_offset = scan.template get<0>(p);
940
941 int c = 0;
942 for (int j = 0 ; j < 2*dim ; j++)
943 {
945 for (int k = 0 ; k < dim ; k++)
946 {
947 kc.set_d(k,pos.get(k) + ((j >> k) & 0x1) );
948 }
949
950 grid_dw.get_sparse(kc,dataBlockPos_dw,offset_dw);
951
952 if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
953 {
954 out.template get<0>(link_offset + c) = dataBlockPos_dw;
955 out.template get<1>(link_offset + c) = offset_dw;
956
957 c++;
958 }
959 }
960 }
961
966 template<unsigned int dim, unsigned int pMask, unsigned int chunk_size,
967 typename padPointType , typename SparseGridType, typename scanType, typename outputType>
968 __global__ void link_construct_insert_up(padPointType padPoints, SparseGridType grid_up, SparseGridType grid_cu, scanType scan, outputType out, Point<dim,int> p_up)
969 {
970 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
971
972 if (p >= padPoints.size()) {return;}
973
974 const unsigned int dataBlockPos = padPoints.template get<0>(p);
975 const unsigned int offset = padPoints.template get<1>(p);
976
977 auto & indexBuffer = grid_cu.getIndexBuffer();
978 auto & dataBuffer = grid_cu.getDataBuffer();
979
980 auto & dataBuffer_dw = grid_up.getDataBuffer();
981
982 auto id = indexBuffer.template get<0>(dataBlockPos);
983 grid_key_dx<dim,int> pos = grid_cu.getCoord(id*chunk_size + offset);
984
985 for (int i = 0 ; i < dim ; i++)
986 {pos.set_d(i,(pos.get(i) - p_up.get(i)) / 2);}
987
988 unsigned int dataBlockPos_dw;
989 unsigned int offset_dw;
990
991 int link_offset = scan.template get<0>(p);
992
993 grid_up.get_sparse(pos,dataBlockPos_dw,offset_dw);
994
995 if (dataBuffer_dw.template get<pMask>(dataBlockPos_dw)[offset_dw] & 0x1)
996 {
997 out.template get<0>(link_offset) = dataBlockPos_dw;
998 out.template get<1>(link_offset) = offset_dw;
999 }
1000 }
1001
1009 template <unsigned int dim,
1010 typename nNN_type,
1011 typename IndexBufT,
1012 typename SparseGridT,
1013 typename nn_blocksT>
1014 __global__ void findNeighbours(IndexBufT indexBuffer, SparseGridT sparseGrid, nn_blocksT nn_blocks)
1015 {
1016 //todo: #ifdef __NVCC__
1017 constexpr unsigned int pIndex = 0;
1018
1019 typedef typename IndexBufT::value_type IndexAggregateT;
1020 typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1021
1022 const unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
1023
1024 const unsigned int dataBlockPos = pos / nNN_type::nNN;
1025 const unsigned int offset = pos % nNN_type::nNN;
1026
1027 if (dataBlockPos >= indexBuffer.size())
1028 {return;}
1029
1030 const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1031
1032 auto neighbourPos = sparseGrid.template getNeighboursPos<nNN_type>(dataBlockId, offset);
1033
1034 nn_blocks.template get<0>(dataBlockPos*nNN_type::nNN + offset) = neighbourPos;
1035 }
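// Thread mapping used above (illustrative): each group of nNN_type::nNN consecutive threads serves one
// data block, and the position inside the group selects which neighbour slot the thread resolves.
// For example, with nNN_type::nNN == 27 (a full 3x3x3 neighbourhood in 3D) threads 0..26 fill the
// neighbour slots of block 0, threads 27..53 those of block 1, and so on.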
1036
1037 template <unsigned int dim,
1038 unsigned int pMask,
1039 typename stencil,
1040 typename IndexBufT,
1041 typename DataBufT,
1042 typename SparseGridT,
1043 typename... Args>
1044 __global__ void
1045 applyStencilInPlace(
1046 Box<dim,int> bx,
1047 IndexBufT indexBuffer,
1048 DataBufT dataBuffer,
1049 SparseGridT sparseGrid,
1050 Args... args)
1051 {
1052 constexpr unsigned int pIndex = 0;
1053
1054 typedef typename IndexBufT::value_type IndexAggregateT;
1055 typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1056
1057 typedef typename DataBufT::value_type AggregateT;
1058 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1059 typedef ScalarTypeOf<AggregateT, pMask> MaskT;
1060 constexpr unsigned int blockSize = MaskBlockT::size;
1061
1062 // NOTE: here we process 1 chunk per block! (we want to be sure to fit local memory constraints,
1063 // since we will also be loading neighbouring elements!) (beware the curse of dimensionality...)
1064 const unsigned int dataBlockPos = blockIdx.x;
1065 const unsigned int offset = threadIdx.x;
1066
1067 if (dataBlockPos >= indexBuffer.size())
1068 {
1069 return;
1070 }
1071
1072 auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1073
1074 // todo: Add management of RED-BLACK stencil application! :)
1075 const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1076 grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1077
1078 unsigned char curMask;
1079
1080 if (offset < blockSize)
1081 {
1082 // Read local mask to register
1083 curMask = dataBlockLoad.template get<pMask>()[offset];
1084 for (int i = 0 ; i < dim ; i++)
1085 {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1086 }
1087
1089 sdataBlockPos.id = dataBlockPos;
1090
1091 stencil::stencil(
1092 sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1093 curMask, args...);
1094 }
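// Hypothetical launch sketch (assumption, not taken from this header): the kernel expects one CUDA
// block per occupied chunk and one thread per chunk element, so a direct launch could look like
//
//     applyStencilInPlace<dim, pMask, StencilT>
//         <<<indexBuffer.size(), chunkSize>>>(box, indexBuffer, dataBuffer, sparseGrid, args...);
//
// where StencilT is one of the stencil functors defined earlier in this file and chunkSize equals
// blockSize above; within OpenFPM the launch is normally dispatched through the library's own
// launch helpers rather than written out like this.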
1095
1096 template <unsigned int dim,
1097 unsigned int pMask,
1098 typename stencil,
1099 typename IndexBufT,
1100 typename DataBufT,
1101 typename SparseGridT,
1102 typename... Args>
1103 __global__ void
1104 applyStencilInPlaceNoShared(
1105 Box<dim,int> bx,
1106 IndexBufT indexBuffer,
1107 DataBufT dataBuffer,
1108 SparseGridT sparseGrid,
1109 Args... args)
1110 {
1111 constexpr unsigned int pIndex = 0;
1112
1113 typedef typename IndexBufT::value_type IndexAggregateT;
1114 typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1115
1116 typedef typename DataBufT::value_type AggregateT;
1117 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1118 typedef ScalarTypeOf<AggregateT, pMask> MaskT;
1119 constexpr unsigned int blockSize = MaskBlockT::size;
1120
1121 int p = blockIdx.x * blockDim.x + threadIdx.x;
1122
1123 auto & pntBuff = sparseGrid.getPointBuffer();
1124
1125 if (p >= pntBuff.size())
1126 {
1127 return;
1128 }
1129
1130 auto id = pntBuff.template get<0>(p);
1131
1132 const unsigned int dataBlockPos = id / blockSize;
1133 const unsigned int offset = id % blockSize;
1134
1135 auto dataBlockLoad = dataBuffer.get(dataBlockPos);
1136
1137 const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1138 grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1139
1140 unsigned char curMask;
1141
1142 if (offset < blockSize)
1143 {
1144 // Read local mask to register
1145 curMask = dataBlockLoad.template get<pMask>()[offset];
1146 if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
1147 }
1148
1150 sdataBlockPos.id = dataBlockPos;
1151
1152 stencil::stencil(
1153 sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1154 curMask, args...);
1155 }
1156
1157 template<unsigned int pMask,
1158 typename dataBuffType,
1159 typename scanType,
1160 typename outType>
1161 __global__ void fill_e_points(dataBuffType dataBuf, scanType scanBuf, outType output)
1162 {
1163 typedef typename dataBuffType::value_type AggregateT;
1164 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1165 constexpr unsigned int blockSize = MaskBlockT::size;
1166
1167 const unsigned int dataBlockPos = blockIdx.x;
1168 const unsigned int offset = threadIdx.x % blockSize;
1169
1170 __shared__ int ato_cnt;
1171
1172 if (threadIdx.x == 0)
1173 {ato_cnt = 0;}
1174
1175 __syncthreads();
1176
1177 if (dataBlockPos >= scanBuf.size() - 1)
1178 {
1179 return;
1180 }
1181
1182 int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1183
1184 int id = atomicAdd(&ato_cnt,predicate);
1185
1186 __syncthreads();
1187
1188 if (predicate == true)
1189 {
1190 output.template get<0>(id + scanBuf.template get<0>(dataBlockPos)) = offset + dataBlockPos * blockSize;
1191 }
1192 }
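// How the compaction above works (illustrative): predicate is 1 for points whose EXIST bit is set,
// the block-wide atomicAdd assigns each existing point a rank inside its chunk, and scanBuf holds
// the exclusive scan of the per-chunk counts, so
//
//     output[rank + scanBuf[chunk]] = offset + chunk * blockSize;
//
// yields a dense list of the linearized ids of all existing points.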
1193
1194 template<unsigned int pMask,
1195 typename dataBufferType,
1196 typename outType>
1197 __global__ void calc_exist_points(dataBufferType dataBuf, outType output)
1198 {
1199 typedef typename dataBufferType::value_type AggregateT;
1200 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1201 constexpr unsigned int blockSize = MaskBlockT::size;
1202
1203 const unsigned int dataBlockPos = blockIdx.x;
1204 const unsigned int offset = threadIdx.x % blockSize;
1205
1206 __shared__ int ato_cnt;
1207
1208 if (threadIdx.x == 0)
1209 {ato_cnt = 0;}
1210
1211 __syncthreads();
1212
1213 if (dataBlockPos >= output.size())
1214 {
1215 return;
1216 }
1217
1218 int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1219
1220 atomicAdd(&ato_cnt,predicate);
1221
1222 __syncthreads();
1223
1224 output.template get<0>(dataBlockPos) = ato_cnt;
1225 }
1226
1227 template<unsigned int dim,
1228 unsigned int pMask,
1229 unsigned int blockEdgeSize,
1230 typename dataBufferType,
1231 typename outType,
1232 typename boxesVector_type,
1233 typename grid_smb_type,
1234 typename indexBuffer_type>
1235 __global__ void calc_remove_points_chunks_boxes(indexBuffer_type indexBuffer,
1236 boxesVector_type boxes,
1237 grid_smb_type grd,
1238 dataBufferType dataBuf,
1239 outType output)
1240 {
1241 typedef typename dataBufferType::value_type AggregateT;
1242 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1243
1244 const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1245
1246 if (dataBlockPos >= indexBuffer.size())
1247 {return;}
1248
1249 auto id = indexBuffer.template get<0>(dataBlockPos);
1250 grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize());
1251
1253
1254 for (int i = 0 ; i < dim ; i++)
1255 {
1256 b.setLow(i,pnt.get(i));
1257 b.setHigh(i,pnt.get(i) + blockEdgeSize - 1);
1258 }
1259
1260 // if this block intersects a remove-box section, mark the chunk
1261
1262 output.template get<1>(dataBlockPos) = 0;
1263 for (int k = 0 ; k < boxes.size() ; k++ )
1264 {
1265 Box<dim,unsigned int> btest = boxes.get(k);
1266
1268
1269 if (btest.Intersect(b,bout) == true)
1270 {
1271 output.template get<1>(dataBlockPos) = 1;
1272 }
1273 }
1274 }
1275
1276 template<typename outType,
1277 typename activeCnkType>
1278 __global__ void collect_rem_chunks(activeCnkType act,
1279 outType output)
1280 {
1281 const unsigned int dataBlockPos = blockIdx.x * blockDim.x + threadIdx.x;
1282
1283 if (dataBlockPos >= act.size()-1)
1284 {return;}
1285
1286 auto id = act.template get<1>(dataBlockPos);
1287 auto id_p1 = act.template get<1>(dataBlockPos+1);
1288
1289 if (id != id_p1)
1290 {
1291 output.template get<0>(id) = dataBlockPos;
1292 }
1293 }
1294
1295 template<unsigned int dim, unsigned int pMask,
1296 typename dataBufferType,
1297 typename indexBufferType,
1298 typename grid_smb_type,
1299 typename activeCntType,
1300 typename boxesType>
1301 __global__ void remove_points(indexBufferType indexBuffer,
1302 grid_smb_type grd,
1303 dataBufferType dataBuffer,
1304 activeCntType active_blocks,
1305 boxesType boxes)
1306 {
1307 typedef typename dataBufferType::value_type AggregateT;
1308 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1309 constexpr unsigned int blockSize = MaskBlockT::size;
1310
1311 const unsigned int dataBlockPos = active_blocks.template get<0>(blockIdx.x);
1312 const unsigned int offset = threadIdx.x % blockSize;
1313
1314 if (dataBlockPos >= dataBuffer.size()-1)
1315 {return;}
1316
1317 int predicate = dataBuffer.template get<pMask>(dataBlockPos)[offset] & 0x1;
1318 // calculate point coord;
1319 auto id = indexBuffer.template get<0>(dataBlockPos);
1320 grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1322
1323 for (int i = 0 ; i < dim ; i++)
1324 {p.get(i) = pnt.get(i);}
1325
1326 // if this point falls inside any of the remove boxes
1327
1328 if (predicate == true)
1329 {
1330 for (int k = 0 ; k < boxes.size() ; k++ )
1331 {
1332 Box<dim,unsigned int> box = boxes.get(k);
1333
1334 if (box.isInside(p) == true)
1335 {
1336 dataBuffer.template get<pMask>(dataBlockPos)[offset] = 0;
1337 }
1338 }
1339 }
1340 }
1341
1342 template<unsigned int dim,
1343 unsigned int pMask,
1344 unsigned int numCnt,
1345 typename indexT,
1346 typename dataBufferType,
1347 typename outType,
1348 typename boxesVector_type,
1349 typename grid_smb_type,
1350 typename indexBuffer_type>
1351 __global__ void calc_exist_points_with_boxes(indexBuffer_type indexBuffer,
1352 boxesVector_type boxes,
1353 grid_smb_type grd,
1354 dataBufferType dataBuf,
1355 outType output,
1356 unsigned int stride_size)
1357 {
1358 typedef typename dataBufferType::value_type AggregateT;
1359 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1360 constexpr unsigned int blockSize = MaskBlockT::size;
1361
1362 const unsigned int dataBlockPos = blockIdx.x;
1363 const unsigned int offset = threadIdx.x % blockSize;
1364
1365 __shared__ int ato_cnt[numCnt];
1366
1367 if (threadIdx.x < numCnt)
1368 {ato_cnt[threadIdx.x] = 0;}
1369
1370 __syncthreads();
1371
1372#ifdef SE_CLASS1
1373
1374 if (numCnt > blockDim.x)
1375 {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1376
1377#endif
1378
1379 if (dataBlockPos >= output.size())
1380 {return;}
1381
1382 int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1383 // calculate point coord;
1384 indexT id = indexBuffer.template get<0>(dataBlockPos);
1385 grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1387
1388 for (int i = 0 ; i < dim ; i++)
1389 {p.get(i) = pnt.get(i);}
1390
1391 // count this point in every box that contains it
1392
1393 if (predicate == true)
1394 {
1395 for (int k = 0 ; k < boxes.size() ; k++ )
1396 {
1397 Box<dim,int> box = boxes.get(k);
1398
1399 if (box.isInside(p) == true)
1400 {
1401 atomicAdd(&ato_cnt[k],1);
1402 }
1403 }
1404 }
1405
1406 __syncthreads();
1407
1408 if (threadIdx.x < boxes.size())
1409 {
1410 output.template get<0>(dataBlockPos+threadIdx.x*stride_size) = ato_cnt[threadIdx.x];
1411 output.template get<1>(dataBlockPos+threadIdx.x*stride_size) = (ato_cnt[threadIdx.x] != 0);
1412 }
1413 }
1414
1425 template<unsigned int dim,
1426 unsigned int pMask,
1427 unsigned int numCnt,
1428 typename indexT,
1429 typename dataBufferType,
1430 typename packBufferType,
1431 typename scanType,
1432 typename scanItType,
1433 typename outputType,
1434 typename boxesVector_type,
1435 typename grid_smb_type,
1436 typename indexBuffer_type>
1437 __global__ void get_exist_points_with_boxes(indexBuffer_type indexBuffer,
1438 boxesVector_type boxes,
1439 grid_smb_type grd,
1440 dataBufferType dataBuf,
1441 packBufferType pack_output,
1442 scanType scan,
1443 scanItType scan_it,
1444 outputType output)
1445 {
1446 typedef typename dataBufferType::value_type AggregateT;
1447 typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
1448 constexpr unsigned int blockSize = MaskBlockT::size;
1449
1450 const unsigned int dataBlockPos = blockIdx.x;
1451 const unsigned int offset = threadIdx.x % blockSize;
1452
1453 __shared__ int ato_cnt[numCnt];
1454
1455 if (threadIdx.x < numCnt)
1456 {ato_cnt[threadIdx.x] = 0;}
1457
1458 __syncthreads();
1459
1460#ifdef SE_CLASS1
1461
1462 if (numCnt > blockDim.x)
1463 {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);}
1464
1465#endif
1466
1467 int predicate = dataBuf.template get<pMask>(dataBlockPos)[offset] & 0x1;
1468 // calculate point coord;
1469 indexT id = indexBuffer.template get<0>(dataBlockPos);
1470 grid_key_dx<dim,int> pnt = grd.InvLinId(id*grd.getBlockSize() + offset);
1471 Point<dim,int> p_;
1472
1473 for (int i = 0 ; i < dim ; i++)
1474 {p_.get(i) = pnt.get(i);}
1475
1476 // if this point falls inside any of the packing boxes
1477
1478 if (predicate == true)
1479 {
1480 for (int k = 0 ; k < boxes.size() ; k++ )
1481 {
1482 Box<dim,int> box = boxes.get(k);
1483
1484 if (box.isInside(p_) == true)
1485 {
1486 // We have an atomic counter for every packing box
1487 int p = atomicAdd(&ato_cnt[k] , 1);
1488
1489 // we have a scan for every box
1490 const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuffer.size() + 1));
1491 unsigned int sit = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1));
1492 int scan_id = scan.template get<0>(dataBlockPos + k*(indexBuffer.size() + 1)) + scan_it.template get<0>(k);
1493 output.template get<0>(scan_id + p) = (offset + dataBlockPos * blockSize) * numCnt + k;
1494 pack_output.template get<0>(scan_id + p) = p + sit;
1495 }
1496 }
1497 }
1498 }
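// Encoding used above (illustrative): every selected point is stored as
//
//     (offset + dataBlockPos * blockSize) * numCnt + k
//
// i.e. the linearized point id times the number of packing boxes, plus the box index k.
// pack_data below recovers the two parts with pb % n_it (the box) and pb / n_it (the point id).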
1499
1500
1501
1502
1503 template<unsigned int dim,
1504 unsigned int blockSize,
1505 unsigned int blockEdgeSize,
1506 unsigned int nshifts,
1507 typename indexT,
1508 typename linearizer,
1509 typename shiftTypeVector,
1510 typename outputType>
1511 __global__ void convert_chunk_ids(indexT * ids,
1512 int n_cnk,
1513 linearizer gridGeoPack,
1514 grid_key_dx<dim,int> origPack,
1515 linearizer gridGeo,
1516 grid_key_dx<dim,int> origUnpack,
1517 outputType output,
1518 shiftTypeVector shifts,
1520 int bs)
1521 {
1522 // points
1523 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1524
1525 if (p >= n_cnk)
1526 {return;}
1527
1528 auto id = ids[p];
1529
1530 for (int i = 0 ; i < nshifts ; i++)
1531 {
1532 grid_key_dx<dim,int> pos = gridGeoPack.InvLinId(id,0) - origPack + origUnpack;
1533
1534 for (int j = 0 ; j < dim ; j++)
1535 {
1536 pos.set_d(j,pos.get(j) + shifts.template get<0>(i)[j]*blockEdgeSize);
1537 if (pos.get(j) < 0)
1538 {
1539 pos.set_d(j,0);
1540 }
1541 if (pos.get(j) >= sz.get(j))
1542 {
1543 pos.set_d(j,pos.get(j) - blockEdgeSize);
1544 }
1545 }
1546
1547 auto plin = gridGeo.LinId(pos);
1548
1549 output.template get<0>(p*nshifts + i + bs) = plin / blockSize;
1550 }
1551 }
1552
1553 template<typename vectorType>
1554 __global__ void set_one(vectorType vt)
1555 {
1556 // points
1557 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1558
1559 if (p >= vt.size())
1560 {return;}
1561
1562 vt.template get<0>(p) = 1;
1563 }
1564
1565 template<unsigned int pSegment,typename newMapType, typename mergeMapType,
1566 typename dataMapType, typename segmentOffsetType,
1567 typename outMapType>
1568 __global__ void construct_new_chunk_map(newMapType new_map, dataMapType dataMap,
1569 mergeMapType merge_id, outMapType outMap,
1570 segmentOffsetType segments_data, int start_p)
1571 {
1572 // points
1573 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1574
1575 if (p >= segments_data.size()-1)
1576 {return;}
1577
1578 unsigned int st = segments_data.template get<pSegment>(p);
1579
1580 int segmentSize = segments_data.template get<pSegment>(p + 1)
1581 - segments_data.template get<pSegment>(p);
1582
1583 for (int j = 0 ; j < segmentSize ; j++)
1584 {
1585 int dm = dataMap.template get<0>(st+j);
1586 new_map.template get<0>(dm) = outMap.template get<0>(p);
1587 }
1588 }
1589
1590 template<unsigned int pMask, typename AggregateT, typename blockConvertType, typename newMapType, typename dataType_ptrs, typename dataType, unsigned int ... prp>
1591 __global__ void copy_packed_data_to_chunks(unsigned int * scan,
1592 unsigned short int * offsets,
1593 blockConvertType blc,
1594 newMapType new_map,
1595 dataType_ptrs data_ptrs,
1596 dataType data_buff,
1597 unsigned int n_cnk,
1598 unsigned int n_shf,
1599 unsigned int n_pnt,
1600 unsigned int i,
1601 unsigned int n_accu_cnk)
1602 {
1603 // points
1604 const unsigned int p = blockIdx.x;
1605
1606 if (p >= n_cnk)
1607 {return;}
1608
1609 int scan_pp = scan[p];
1610 int n_block_pnt = scan[p+1] - scan_pp;
1611
1612 if (threadIdx.x < n_block_pnt)
1613 {
1614 unsigned short int off = offsets[scan[p] + threadIdx.x];
1615
1616 int conv = blc.template get<0>(i)[off];
1617
1618 unsigned short int off_c = conv & 0xFFFF;
1619 unsigned short int shf_c = conv >> 16;
1620
1621 unsigned int pos_c = new_map.template get<0>(n_shf*p + shf_c + n_accu_cnk);
1622
1623 sparsegridgpu_unpack_impl<AggregateT, dataType ,prp ...>
1624 spi(pos_c,off_c,data_buff,scan_pp + threadIdx.x,data_ptrs,n_pnt);
1625
1626 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1627
1628 data_buff.template get<pMask>(pos_c)[off_c] |= 0x1;
1629 }
1630 }
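// Layout of the conversion table used above (illustrative): each 32-bit entry of blc packs the
// destination offset inside the chunk in its low 16 bits and the shift index in its high 16 bits,
// decoded as
//
//     unsigned short int off_c = conv & 0xFFFF;   // in-chunk offset
//     unsigned short int shf_c = conv >> 16;      // which shifted copy of the chunk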
1631
1632
1633 template<typename scanPointerType, typename scanType>
1634 __global__ void last_scan_point(scanPointerType scan_ptr, scanType scan,unsigned int stride, unsigned int n_pack)
1635 {
1636 const unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
1637
1638 if (k >= n_pack) {return;}
1639
1640 unsigned int ppos = scan.template get<0>((k+1)*stride-1);
1641 unsigned int pos = scan.template get<1>((k+1)*stride-1);
1642
1643 ((unsigned int *)scan_ptr.ptr[k])[pos] = ppos;
1644 }
1645
1646 template<unsigned int pMask,
1647 typename AggregateT,
1648 unsigned int n_it,
1649 unsigned int n_prp,
1650 typename indexT,
1651 typename pntBuff_type,
1652 typename pointOffset_type,
1653 typename indexBuffer_type,
1654 typename dataBuffer_type,
1655 typename scan_type,
1656 unsigned int blockSize,
1657 unsigned int ... prp>
1658 __global__ void pack_data(pntBuff_type pntBuff,
1659 dataBuffer_type dataBuff,
1660 indexBuffer_type indexBuff,
1661 scan_type scan,
1662 pointOffset_type point_offsets,
1663 arr_ptr<n_it> index_ptr,
1664 arr_ptr<n_it> scan_ptr,
1665 arr_arr_ptr<n_it,n_prp> * data_ptr,
1666 arr_ptr<n_it> offset_ptr,
1667 arr_ptr<n_it> mask_ptr,
1669 {
1670 const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
1671
1672 if (p >= pntBuff.size())
1673 {return;}
1674
1675 const unsigned int pb = pntBuff.template get<0>(p);
1676 const unsigned int p_offset = point_offsets.template get<0>(p);
1677
1678 const unsigned int k = pb % n_it;
1679 const unsigned int id = pb / n_it;
1680
1681 const unsigned int dataBlockPos = id / blockSize;
1682 const unsigned int offset = id % blockSize;
1683
1684 unsigned int ppos = scan.template get<0>(dataBlockPos + k*(indexBuff.size() + 1));
1685 const unsigned int dataBlockPosPack = scan.template get<1>(dataBlockPos + k*(indexBuff.size() + 1));
1686
1687 sparsegridgpu_pack_impl<AggregateT, dataBuffer_type ,prp ...>
1688 spi(dataBlockPos,offset,dataBuff,p_offset,data_ptr->ptr[k],sar.sa[k]);
1689
1690 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(spi);
1691
1692 ((unsigned int *)scan_ptr.ptr[k])[dataBlockPosPack] = ppos;
1693
1694 ((indexT *)index_ptr.ptr[k])[dataBlockPosPack] = indexBuff.template get<0>(dataBlockPos);
1695 ((short int *)offset_ptr.ptr[k])[p_offset] = offset;
1696 ((unsigned char *)mask_ptr.ptr[k])[p_offset] = dataBuff.template get<pMask>(dataBlockPos)[offset];
1697 }
1698}
1699
1700#endif //OPENFPM_PDATA_SPARSEGRIDGPU_KERNELS_CUH