OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
SparseGridGpu_ker_util.hpp
/*
 * SparseGridGpu_ker_util.hpp
 *
 *  Created on: Aug 7, 2019
 *      Author: i-bird
 */

#ifndef SPARSEGRIDGPU_KER_UTIL_HPP_
#define SPARSEGRIDGPU_KER_UTIL_HPP_

#include "util/variadic_to_vmpl.hpp"

template<bool to_set>
struct set_compile_condition
{
    template<unsigned int p, typename SrcType, typename AggrType>
    __device__ __host__ static inline void set(SrcType & src, AggrType & aggr)
    {
        src = aggr.template get<p>();
    }
};

template<>
struct set_compile_condition<false>
{
    template<unsigned int p, typename SrcType, typename AggrType>
    __device__ __host__ static inline void set(SrcType & src, AggrType & aggr)
    {}
};
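/* set_compile_condition is a compile-time switch: set_compile_condition<true>::set<p>(src,aggr)
 * copies property p of the aggregate into src, while the <false> specialization is a no-op.
 * It is used further down to assign the background value only when the loaded property
 * differs from the mask property (pMask != p), without instantiating an invalid assignment
 * otherwise. Illustrative sketch (hypothetical usage, not part of this header):
 *
 *   aggregate<float,int> bck;                      // background values
 *   float v;
 *   set_compile_condition<true>::set<0>(v, bck);   // v = bck.get<0>()
 *   set_compile_condition<false>::set<0>(v, bck);  // does nothing
 */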

template<unsigned int dim,typename T>
struct cross_stencil
{
    T xm[dim];
    T xp[dim];
};
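/* cross_stencil holds, for each of the dim axes, the value at the minus (xm) and
 * plus (xp) neighbour of a grid point: the 2*dim values touched by a cross/star
 * stencil of radius 1.
 */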

/*template<unsigned int dim, unsigned int block_edge_size>
struct shift_position
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        int accu = 1;
        int pos_s = 0;
        for (int i = 0 ; i < dim ; i++)
        {
            pos_s += (pos % block_edge_size + stencilRadius)*accu;
            accu *= (block_edge_size + 2*stencilRadius);
            pos /= block_edge_size;
        }

        return pos_s;
    }
};

template<unsigned int block_edge_size>
struct shift_position<2,block_edge_size>
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        unsigned int x = pos % block_edge_size;
        unsigned int y = (pos / block_edge_size);

        unsigned int g_sz = block_edge_size + 2*stencilRadius;

        return (x+stencilRadius) + (y+stencilRadius)*g_sz;
    }
};


template<unsigned int block_edge_size>
struct shift_position<3,block_edge_size>
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        unsigned int x = pos % block_edge_size;
        unsigned int y = (pos / block_edge_size) % block_edge_size;
        unsigned int z = (pos / (block_edge_size*block_edge_size));

        unsigned int g_sz = block_edge_size + 2*stencilRadius;

        return (x+stencilRadius) + (y+stencilRadius)*g_sz + (z+stencilRadius)*g_sz*g_sz;
    }
};*/

template<unsigned int dim>
struct NNStar
{
    static const int nNN = IntPow<2, dim>::value;

    template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
    __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
                                             blockMap_type & blockMap,
                                             SparseGrid_type & sparseGrid,
                                             const unsigned int offset)
    {
        // TODO: also implement the full-neighbourhood version; this is just the cross stencil
        int neighbourPos = -1;
        if (offset < 2*dim)
        {
            unsigned int d = offset/2;
            int dPos = blockCoord.get(d) + (offset%2)*2 - 1;
            blockCoord.set_d(d, dPos);

            int bl = sparseGrid.getBlockLinId(blockCoord);

            bl = (dPos < 0)?-1:bl;

            neighbourPos = blockMap.get_sparse(bl).id;
        }
        return neighbourPos;
    }

    template<typename indexT, unsigned int blockEdgeSize, typename coordType>
    __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
    {
        int neighbourNum = -1;
        int ctr = 0;
        for (int j = 0; j < dim; ++j)
        {
            int c = static_cast<int>(coord.get(j)) - static_cast<int>(stencilSupportRadius);
            if (c < 0)
            {
                neighbourNum = 2*j;
                ++ctr;
            }
            else if (c >= blockEdgeSize)
            {
                neighbourNum = 2*j + 1;
                ++ctr;
            }
        }
        if (ctr > 1) // If we are on a "corner"
        {
            neighbourNum = 0;
        }

        return neighbourNum;
    }

    template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
    __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
    {
        bool isPadding_ = false;
        for (int d=0; d<dim; ++d)
        {
            auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
            auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
            typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
            typename std::remove_all_extents<Mask_type>::type neighbourMinus = enlargedBlock[nMinusId];
            isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
            isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourMinus));
            if (isPadding_) break;
        }

        return isPadding_;
    }

    /*! \brief Given a coordinate, return the neighbourhood chunk position and the offset
     *         in the neighbourhood chunk
     */
    __device__ static inline bool getNNindex_offset()
    {
        return false;
    }
};
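/* In getNNpos the offset encodes the cross neighbour: axis d = offset/2, direction
 * minus for even offsets and plus for odd ones (dPos = coord +/- 1). For dim = 3:
 * offset 0 -> (-1,0,0), 1 -> (+1,0,0), 2 -> (0,-1,0), 3 -> (0,+1,0), 4 -> (0,0,-1),
 * 5 -> (0,0,+1). getNNskin returns the same encoding for a point of the enlarged
 * (ghost-padded) block, collapsing corner cases to 0.
 */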

template<unsigned int n_it>
struct arr_ptr
{
    void * ptr[n_it];
};

template<unsigned int n_it,unsigned int n_prp>
struct arr_arr_ptr
{
    void * ptr[n_it][n_prp+1];
};

template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id>
struct meta_copy_block
{
    template<typename dataBuffer_type>
    __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        ((copy_type *)data_ptr[prp_id])[ppos] = dataBuff.template get<prp_val>(dataBlockPos)[offset];
    }

    template<typename dataBuffer_type>
    __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        dataBuff.template get<prp_val>(dataBlockPos)[offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos];
    }
};

template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1>
struct meta_copy_block<copy_type[N1],nprp,prp_val,prp_id>
{
    template<typename dataBuffer_type>
    __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        for (int i = 0 ; i < N1 ; i++)
        {
            ((copy_type *)data_ptr[prp_id])[ppos+i*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][offset];
        }
    }

    template<typename dataBuffer_type>
    __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        for (int i = 0 ; i < N1 ; i++)
        {
            dataBuff.template get<prp_val>(dataBlockPos)[i][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos+i*n_pnt];
        }
    }
};

template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1, unsigned int N2>
struct meta_copy_block<copy_type[N1][N2],nprp,prp_val,prp_id>
{
    template<typename dataBuffer_type>
    __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        for (int i = 0 ; i < N1 ; i++)
        {
            for (int j = 0 ; j < N2 ; j++)
            {
                ((copy_type *)data_ptr[prp_id])[ppos + (i*N2 + j)*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset];
            }
        }
    }

    template<typename dataBuffer_type>
    __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
    {
        for (int i = 0 ; i < N1 ; i++)
        {
            for (int j = 0 ; j < N2 ; j++)
            {
                dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos + (i*N2 + j)*n_pnt];
            }
        }
    }
};
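/* Layout produced by meta_copy_block::copy: the packed buffer data_ptr[prp_id] stores
 * one value per packed point and per component, component-major. A scalar property of
 * point ppos lands at index ppos; component i of an [N1] array property lands at
 * ppos + i*n_pnt; component (i,j) of an [N1][N2] property lands at ppos + (i*N2 + j)*n_pnt,
 * so each component is contiguous across the n_pnt packed points. copy_inv performs the
 * inverse move from the packed buffer back into the data blocks.
 */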

/*! \brief this class is a functor for "for_each" algorithm
 *
 */
template<typename AggregateT, typename dataBuffer_type, int ... prp>
struct sparsegridgpu_pack_impl
{
    typedef typename to_boost_vmpl<prp...>::type vprp;

    //! position of the block
    unsigned int dataBlockPos;

    //! offset within the data block
    unsigned int offset;

    //! data buffer
    dataBuffer_type & dataBuff;

    //! point position
    unsigned int ppos;

    //! data pointer
    void * (& data_ptr)[sizeof...(prp)+1];

    //! Number of points to pack
    unsigned int n_pnt;

    //! constructor
    __device__ __host__ inline sparsegridgpu_pack_impl(unsigned int dataBlockPos,
                                                       unsigned int offset,
                                                       dataBuffer_type & dataBuff,
                                                       unsigned int ppos,
                                                       void * (& data_ptr)[sizeof...(prp)+1],
                                                       unsigned int n_pnt)
    :dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
    {};

    //! It calls the copy function for each property
    template<typename T>
    __device__ __host__ inline void operator()(T& t)
    {
        typedef typename boost::mpl::at<vprp,T>::type prp_cp;

        // Remove the reference from the type to copy
        typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

        meta_copy_block<pack_type,sizeof...(prp)+1,prp_cp::value,T::value>::copy(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
    }
};


/*! \brief this class is a functor for "for_each" algorithm
 *
 */
template<typename AggregateT, typename dataBuffer_type, int ... prp>
struct sparsegridgpu_unpack_impl
{
    typedef typename to_boost_vmpl<prp...>::type vprp;

    //! position of the block
    unsigned int dataBlockPos;

    //! offset within the data block
    unsigned int offset;

    //! data buffer
    dataBuffer_type & dataBuff;

    //! point position
    unsigned int ppos;

    //! data pointer
    arr_arr_ptr<1,sizeof...(prp)> & data_ptr;

    //! Number of points to unpack
    unsigned int n_pnt;

    //! constructor
    __device__ __host__ inline sparsegridgpu_unpack_impl(unsigned int dataBlockPos,
                                                         unsigned int offset,
                                                         dataBuffer_type & dataBuff,
                                                         unsigned int ppos,
                                                         arr_arr_ptr<1,sizeof...(prp)> & data_ptr,
                                                         unsigned int n_pnt)
    :dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
    {};

    //! It calls the copy function for each property
    template<typename T>
    __device__ __host__ inline void operator()(T& t)
    {
        typedef typename boost::mpl::at<vprp,T>::type prp_cp;

        // Remove the reference from the type to copy
        typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

        meta_copy_block<pack_type,sizeof...(prp),prp_cp::value,T::value>::copy_inv(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
    }
};

/*! \brief this class is a functor for "for_each" algorithm
 *
 */
template<typename AggregateT, int ... prp>
struct sparsegridgpu_pack_request
{
    typedef typename to_boost_vmpl<prp...>::type vprp;

    //! point size
    size_t point_size;

    //! constructor
    inline sparsegridgpu_pack_request()
    :point_size(0)
    {};

    //! It accumulates the size of each property
    template<typename T>
    inline void operator()(T& t)
    {
        typedef typename boost::mpl::at<vprp,T>::type prp_cp;

        // Remove the reference from the type to copy
        typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

        point_size += sizeof(pack_type);
    }
};
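/* Usage sketch (illustrative; it assumes the boost::mpl for_each_ref driver used
 * elsewhere in OpenFPM): computing the per-point byte size of properties 0 and 1.
 *
 *   sparsegridgpu_pack_request<AggregateT,0,1> preq;
 *   boost::mpl::for_each_ref<boost::mpl::range_c<int,0,2>>(preq);
 *   // preq.point_size == sizeof(property 0) + sizeof(property 1)
 */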

template<unsigned int edgeSize, unsigned int dim>
inline __device__ unsigned int coordToLin(const unsigned int (&coord)[dim], const unsigned int paddingSize = 0)
{
    unsigned int linId = coord[dim - 1];
    for (int d = dim - 2; d >= 0; --d)
    {
        linId *= edgeSize + 2 * paddingSize;
        linId += coord[d];
    }
    return linId;
}
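/* coordToLin linearizes a dim-dimensional coordinate inside a block whose edge is
 * enlarged by 2*paddingSize, with the x axis fastest. Worked example (dim = 2,
 * edgeSize = 8, paddingSize = 1, so the enlarged edge is 10):
 *
 *   coord = {2, 3}  ->  linId = 3*10 + 2 = 32
 */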


template<unsigned int edgeSize, typename CoordT, unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> &coord, const unsigned int paddingSize = 0)
{
    unsigned int linId = coord.get(dim - 1);
    for (int d = dim - 2; d >= 0; --d)
    {
        linId *= edgeSize + 2 * paddingSize;
        linId += coord.get(d);
    }
    return linId;
}

template <typename CoordT,unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> & coord, grid_key_dx<dim, int> & blockDimensions)
{
    unsigned int linId = coord.get(dim - 1);
    for (int d = dim - 2; d >= 0; --d)
    {
        linId *= blockDimensions.get(d);
        linId += coord.get(d);
    }
    return linId;
}



template<unsigned int edgeSize, unsigned int dim>
inline __device__ void linToCoordWithOffset(const unsigned int linId, const unsigned int offset, unsigned int (&coord)[dim])
{
    unsigned int linIdTmp = linId;
    for (unsigned int d = 0; d < dim; ++d)
    {
        coord[d] = linIdTmp % edgeSize;
        coord[d] += offset;
        linIdTmp /= edgeSize;
    }
}

template<unsigned int edgeSize, unsigned int dim>
inline __device__ void linToCoord(const unsigned int linId, unsigned int (&coord)[dim])
{
    unsigned int linIdTmp = linId;
    for (unsigned int d = 0; d < dim; ++d)
    {
        coord[d] = linIdTmp % edgeSize;
        linIdTmp /= edgeSize;
    }
}
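/* linToCoord is the inverse mapping on the plain (non-enlarged) block: for dim = 2 and
 * edgeSize = 4, linId = 9 gives coord = {1, 2}. linToCoordWithOffset additionally shifts
 * every component by offset, which is used below to place a thread's cell inside the
 * enlarged shared-memory block (offset = stencilSupportRadius).
 */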

constexpr int gt = 0;
constexpr int nt = 1;
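// gt and nt index the two properties of ghostLayerToThreadsMapping used below:
// property gt stores the linear position of a ghost cell in the enlarged block,
// property nt stores which neighbour block (in the NNStar encoding) owns its data.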

template<unsigned int nLoop, unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl
{
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
                                       const unsigned int blockId,
                                       AggrBck & bck)
    {
        printf("Error: loadGhostBlock_impl not implemented for nLoop=%d\n", nLoop);
    }
};
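/* nLoop is the number of ghost elements each thread has to load: the ghost layer of an
 * enlarged block has ghostLayerSize cells but only blockDim.x threads, so each thread
 * covers up to nLoop cells. Specializations exist below for nLoop = 1, 2, 3 and 7; any
 * other value falls through to this generic template, which only reports the error.
 */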

template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<1,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
{
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        unsigned int acc = 1;
        unsigned int offset = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            offset += v*acc;
            ctr /= edge;
            acc *= blockEdgeSize;
        }
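        // Worked example for the wrap-around above (blockEdgeSize = 8, radius = 1, edge = 10):
        // a ghost cell with (ctr % edge) == 0 gives v = -1, which wraps to 7: the ghost
        // cell to the left of this block maps to the rightmost interior cell of the
        // minus-x neighbour block, whose data is fetched below via neighboursPos.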

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const unsigned int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];

        auto bdata = block.template get<p>()[threadIdx.x];

        auto bmask = block.template get<pMask>()[threadIdx.x];
        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];

        if (bmask == 0) { set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
        if (gmask == 0) { set_compile_condition<pMask != p>::template set<p>(gdata,bck);}

        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = bdata;
    }
};

template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<2,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
{
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
        const int pos_d1 = (threadIdx.x + blockDim.x) % ghostLayerSize;

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            offset += v*acc;
            offset2 += v2*acc;
            ctr /= edge;
            ctr2 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];

        auto bdata = block.template get<p>()[threadIdx.x];

        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];

        auto bmask = block.template get<pMask>()[threadIdx.x];

        if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
        if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
        if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}

        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId_b] = bdata;
    }
};

template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<3,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
{
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
        const int pos_d1 = (threadIdx.x + 2*blockDim.x) % ghostLayerSize;

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
        short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
        const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        int ctr3 = linId3;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        unsigned int offset3 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            int v3 = (ctr3 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
            offset += v*acc;
            offset2 += v2*acc;
            offset3 += v3*acc;
            ctr /= edge;
            ctr2 /= edge;
            ctr3 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];
        auto nPos3 = neighboursPos[neighbourNum3];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
        auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];

        auto bdata = block.template get<p>()[threadIdx.x];

        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
        auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];

        auto bmask = block.template get<pMask>()[threadIdx.x];

        if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
        if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
        if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
        if (gmask3 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}

        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId3] = gdata3;
        sharedRegionPtr[linId_b] = bdata;
    }
};

template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<7,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
{
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
        const int pos_d1 = (threadIdx.x + 6*blockDim.x) % ghostLayerSize;

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
        short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos + 2*blockDim.x);
        short int neighbourNum4 = ghostLayerToThreadsMapping.template get<nt>(pos + 3*blockDim.x);
        short int neighbourNum5 = ghostLayerToThreadsMapping.template get<nt>(pos + 4*blockDim.x);
        short int neighbourNum6 = ghostLayerToThreadsMapping.template get<nt>(pos + 5*blockDim.x);
        short int neighbourNum7 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
        const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos + 2*blockDim.x);
        const unsigned int linId4 = ghostLayerToThreadsMapping.template get<gt>(pos + 3*blockDim.x);
        const unsigned int linId5 = ghostLayerToThreadsMapping.template get<gt>(pos + 4*blockDim.x);
        const unsigned int linId6 = ghostLayerToThreadsMapping.template get<gt>(pos + 5*blockDim.x);
        const unsigned int linId7 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        int ctr3 = linId3;
        int ctr4 = linId4;
        int ctr5 = linId5;
        int ctr6 = linId6;
        int ctr7 = linId7;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        unsigned int offset3 = 0;
        unsigned int offset4 = 0;
        unsigned int offset5 = 0;
        unsigned int offset6 = 0;
        unsigned int offset7 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            int v3 = (ctr3 % edge) - stencilSupportRadius;
            int v4 = (ctr4 % edge) - stencilSupportRadius;
            int v5 = (ctr5 % edge) - stencilSupportRadius;
            int v6 = (ctr6 % edge) - stencilSupportRadius;
            int v7 = (ctr7 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
            v4 = (v4 < 0)?(v4 + blockEdgeSize):v4;
            v5 = (v5 < 0)?(v5 + blockEdgeSize):v5;
            v6 = (v6 < 0)?(v6 + blockEdgeSize):v6;
            v7 = (v7 < 0)?(v7 + blockEdgeSize):v7;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
            v4 = (v4 >= blockEdgeSize)?v4-blockEdgeSize:v4;
            v5 = (v5 >= blockEdgeSize)?v5-blockEdgeSize:v5;
            v6 = (v6 >= blockEdgeSize)?v6-blockEdgeSize:v6;
            v7 = (v7 >= blockEdgeSize)?v7-blockEdgeSize:v7;
            offset += v*acc;
            offset2 += v2*acc;
            offset3 += v3*acc;
            offset4 += v4*acc;
            offset5 += v5*acc;
            offset6 += v6*acc;
            offset7 += v7*acc;
            ctr /= edge;
            ctr2 /= edge;
            ctr3 /= edge;
            ctr4 /= edge;
            ctr5 /= edge;
            ctr6 /= edge;
            ctr7 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];
        auto nPos3 = neighboursPos[neighbourNum3];
        auto nPos4 = neighboursPos[neighbourNum4];
        auto nPos5 = neighboursPos[neighbourNum5];
        auto nPos6 = neighboursPos[neighbourNum6];
        auto nPos7 = neighboursPos[neighbourNum7];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
        auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];
        auto gdata4 = blockMap.template get_ele<p>(nPos4)[offset4];
        auto gdata5 = blockMap.template get_ele<p>(nPos5)[offset5];
        auto gdata6 = blockMap.template get_ele<p>(nPos6)[offset6];
        auto gdata7 = blockMap.template get_ele<p>(nPos7)[offset7];

        auto bdata = block.template get<p>()[threadIdx.x];

        auto bmask = block.template get<pMask>()[threadIdx.x];
        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
        auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];
        auto gmask4 = blockMap.template get_ele<pMask>(nPos4)[offset4];
        auto gmask5 = blockMap.template get_ele<pMask>(nPos5)[offset5];
        auto gmask6 = blockMap.template get_ele<pMask>(nPos6)[offset6];
        auto gmask7 = blockMap.template get_ele<pMask>(nPos7)[offset7];

        if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
        if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
        if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
        if (gmask3 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}
        if (gmask4 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata4,bck);}
        if (gmask5 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata5,bck);}
        if (gmask6 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata6,bck);}
        if (gmask7 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata7,bck);}

        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId3] = gdata3;
        sharedRegionPtr[linId4] = gdata4;
        sharedRegionPtr[linId5] = gdata5;
        sharedRegionPtr[linId6] = gdata6;
        sharedRegionPtr[linId7] = gdata7;
        sharedRegionPtr[linId_b] = bdata;
    }
};

#endif /* SPARSEGRIDGPU_KER_UTIL_HPP_ */