OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
SparseGridGpu_ker_util.hpp
1 /*
2  * SparseGridGpu_ker_util.hpp
3  *
4  * Created on: Aug 7, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef SPARSEGRIDGPU_KER_UTIL_HPP_
9 #define SPARSEGRIDGPU_KER_UTIL_HPP_
10 
11 #include "util/variadic_to_vmpl.hpp"
12 
13 template<bool to_set>
14 struct set_compile_condition
15 {
16  template<unsigned int p, typename SrcType, typename AggrType>
17  __device__ __host__ static inline void set(SrcType & src,AggrType & aggr)
18  {
19  src = aggr.template get<p>();
20  }
21 };
22 
23 template<>
24 struct set_compile_condition<false>
25 {
26  template<unsigned int p, typename SrcType, typename AggrType>
27  __device__ __host__ static inline void set(SrcType & src,AggrType & aggr)
28  {}
29 };
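The two specializations above form a compile-time switch: when the boolean parameter is true, set() overwrites the source with property p of the aggregate; when it is false the call compiles away. Below it is instantiated as set_compile_condition<pMask != p>, so the background value is substituted only when the property being loaded is not the mask itself. A minimal sketch, with a hypothetical toy_background type standing in for the background aggregate:

// Illustration only: toy_background mimics an aggregate with a templated get<p>().
struct toy_background
{
    float data[2];
    template<unsigned int p> float & get() { return data[p]; }
};

__device__ __host__ inline void set_compile_condition_example()
{
    toy_background bck;
    bck.get<0>() = 1.0f;

    float v = 0.0f;
    set_compile_condition<true>::set<0>(v, bck);   // v = bck.get<0>() = 1.0f
    set_compile_condition<false>::set<0>(v, bck);  // no-op: v is left unchanged
}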
30 
31 template<unsigned int dim,typename T>
32 struct cross_stencil
33 {
34  T xm[dim];
35  T xp[dim];
36 };
37 
38 /*template<unsigned int dim, unsigned int block_edge_size>
39 struct shift_position
40 {
41  __device__ static inline int shift(int pos, int stencilRadius)
42  {
43  int accu = 1;
44  int pos_s = 0;
45  for (int i = 0 ; i < dim ; i++)
46  {
47  pos_s += (pos % block_edge_size + stencilRadius)*accu;
48  accu *= (block_edge_size + 2*stencilRadius);
49  pos /= block_edge_size;
50  }
51 
52  return pos_s;
53  }
54 };
55 
56 template<unsigned int block_edge_size>
57 struct shift_position<2,block_edge_size>
58 {
59  __device__ static inline int shift(int pos, int stencilRadius)
60  {
61  unsigned int x = pos % block_edge_size;
62  unsigned int y = (pos / block_edge_size);
63 
64  unsigned int g_sz = block_edge_size + 2*stencilRadius;
65 
66  return (x+stencilRadius) + (y+stencilRadius)*g_sz;
67  }
68 };
69 
70 
71 template<unsigned int block_edge_size>
72 struct shift_position<3,block_edge_size>
73 {
74  __device__ static inline int shift(int pos, int stencilRadius)
75  {
76  unsigned int x = pos % block_edge_size;
77  unsigned int y = (pos / block_edge_size) % block_edge_size;
78  unsigned int z = (pos / (block_edge_size*block_edge_size));
79 
80  unsigned int g_sz = block_edge_size + 2*stencilRadius;
81 
82  return (x+stencilRadius) + (y+stencilRadius)*g_sz + (z+stencilRadius)*g_sz*g_sz;
83  }
84 };*/
85 
86 template<unsigned int dim>
87 struct NNStar
88 {
89  static const int nNN = IntPow<2, dim>::value;
90 
91  template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
92  __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
93  blockMap_type & blockMap,
94  SparseGrid_type & sparseGrid,
95  const unsigned int offset)
96  {
97  //todo: also do the full neighbourhood version, this is just cross
98  int neighbourPos = -1;
99  if (offset < 2*dim)
100  {
101  unsigned int d = offset/2;
102  int dPos = blockCoord.get(d) + (offset%2)*2 - 1;
103  blockCoord.set_d(d, dPos);
104 
105  int bl = sparseGrid.getBlockLinId(blockCoord);
106 
107  bl = (dPos < 0)?-1:bl;
108 
109  neighbourPos = blockMap.get_sparse(bl).id;
110  }
111  return neighbourPos;
112  }
113 
114  template<typename indexT, unsigned int blockEdgeSize, typename coordType>
115  __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
116  {
117  int neighbourNum = -1;
118  int ctr = 0;
119  for (int j = 0; j < dim; ++j)
120  {
121  int c = static_cast<int>(coord.get(j)) - static_cast<int>(stencilSupportRadius);
122  if (c < 0)
123  {
124  neighbourNum = 2*j;
125  ++ctr;
126  }
127  else if (c >= blockEdgeSize)
128  {
129  neighbourNum = 2*j + 1;
130  ++ctr;
131  }
132  }
133  if (ctr > 1) // If we are on a "corner"
134  {
135  neighbourNum = 0;
136  }
137 
138  return neighbourNum;
139  }
140 
141  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
142  __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
143  {
144  bool isPadding_ = false;
145  for (int d=0; d<dim; ++d)
146  {
147  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
148  auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
149  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
150  typename std::remove_all_extents<Mask_type>::type neighbourMinus = enlargedBlock[nMinusId];
151  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
152  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourMinus));
153  if (isPadding_) break;
154  }
155 
156  return isPadding_;
157  }
158 
159  /*! \brief given a coordinate give the neighborhood chunk position and the offset in the neighborhood chunk
160  *
161  *
162  */
163  __device__ static inline bool getNNindex_offset()
164  {
165  return false;
166  }
167 };
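NNStar describes the cross (von Neumann) neighbourhood: getNNpos resolves one of the 2*dim face-neighbour blocks, while getNNskin classifies a coordinate of the enlarged (padded) block into the face it lies on, with corners collapsed to 0. A hedged host-side sketch, using a hypothetical toy_coord type, for blockEdgeSize = 8 and stencilSupportRadius = 1:

// Illustration only: toy_coord is a stand-in for the coordinate types used by the grid.
template<unsigned int dim>
struct toy_coord
{
    int c[dim];
    int get(int i) const { return c[i]; }
};

inline void nnstar_skin_example()
{
    toy_coord<2> left   = {{0, 3}};   // x below the support radius -> x-minus face
    toy_coord<2> right  = {{9, 3}};   // x inside the plus-side skin -> x-plus face
    toy_coord<2> corner = {{0, 0}};   // on two faces at once -> collapsed to 0

    int a = NNStar<2>::getNNskin<int, 8>(left, 1);    // 0
    int b = NNStar<2>::getNNskin<int, 8>(right, 1);   // 1
    int c = NNStar<2>::getNNskin<int, 8>(corner, 1);  // 0
    (void)a; (void)b; (void)c;
}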
168 
169 template<unsigned int n_it>
170 struct arr_ptr
171 {
172  void * ptr[n_it];
173 };
174 
175 template<unsigned int n_it,unsigned int n_prp>
176 struct arr_arr_ptr
177 {
178  void * ptr[n_it][n_prp+1];
179 };
180 
181 template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id>
182 struct meta_copy_block
183 {
184  template<typename dataBuffer_type>
185  __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
186  {
187  ((copy_type *)data_ptr[prp_id])[ppos] = dataBuff.template get<prp_val>(dataBlockPos)[offset];
188  }
189 
190  template<typename dataBuffer_type>
191  __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
192  {
193  dataBuff.template get<prp_val>(dataBlockPos)[offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos];
194  }
195 };
196 
197 template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1>
198 struct meta_copy_block<copy_type[N1],nprp,prp_val,prp_id>
199 {
200  template<typename dataBuffer_type>
201  __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
202  {
203  for (int i = 0 ; i < N1 ; i++)
204  {
205  ((copy_type *)data_ptr[prp_id])[ppos+i*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][offset];
206  }
207  }
208 
209  template<typename dataBuffer_type>
210  __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
211  {
212  for (int i = 0 ; i < N1 ; i++)
213  {
214  dataBuff.template get<prp_val>(dataBlockPos)[i][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos+i*n_pnt];
215  }
216  }
217 };
218 
219 template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1, unsigned int N2>
220 struct meta_copy_block<copy_type[N1][N2],nprp,prp_val,prp_id>
221 {
222  template<typename dataBuffer_type>
223  __device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
224  {
225  for (int i = 0 ; i < N1 ; i++)
226  {
227  for (int j = 0 ; j < N2 ; j++)
228  {
229  ((copy_type *)data_ptr[prp_id])[ppos + (i*N2 + j)*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset];
230  }
231  }
232  }
233 
234  template<typename dataBuffer_type>
235  __device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
236  {
237  for (int i = 0 ; i < N1 ; i++)
238  {
239  for (int j = 0 ; j < N2 ; j++)
240  {
241  dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos + (i*N2 + j)*n_pnt];
242  }
243  }
244  }
245 };
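meta_copy_block moves one point of one property between the block data buffer and a flat transfer buffer; for array-valued properties the components are strided by n_pnt, so component i of every packed point ends up contiguous in the destination. A hedged sketch with a hypothetical toy_block_buffer that exposes the access pattern the copy() overloads expect:

// Illustration only: a toy data buffer with the layout get<prp>(blockPos)[component][offset].
struct toy_block_buffer
{
    float v[3][64];   // one block of a float[3] property: [component][offset]

    template<unsigned int prp>
    float (& get(unsigned int /* blockPos */))[3][64] { return v; }
};

inline void meta_copy_block_example()
{
    toy_block_buffer buff = {};
    buff.v[0][5] = 1.0f; buff.v[1][5] = 2.0f; buff.v[2][5] = 3.0f;

    const unsigned int n_pnt = 10;          // number of points being packed
    float out[3 * 10] = {};                 // flat destination buffer
    void * data_ptr[1] = { out };

    // Pack point ppos=7 taken from offset 5 of block 0 of property 0 (a float[3]).
    meta_copy_block<float[3], 1, 0, 0>::copy(data_ptr, buff, 7, 0, 5, n_pnt);

    // out[7] == 1, out[7 + n_pnt] == 2, out[7 + 2*n_pnt] == 3
}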
246 
255 //! this class is a functor for "for_each" algorithm
256 template<typename AggregateT, typename dataBuffer_type, int ... prp>
257 struct sparsegridgpu_pack_impl
258 {
259  typedef typename to_boost_vmpl<prp...>::type vprp;
260 
262  unsigned int dataBlockPos;
263 
265  unsigned int offset;
266 
268  dataBuffer_type & dataBuff;
269 
271  unsigned int ppos;
272 
274  void * (& data_ptr)[sizeof...(prp)+1];
275 
277  unsigned int n_pnt;
278 
282  __device__ __host__ inline sparsegridgpu_pack_impl(unsigned int dataBlockPos,
283  unsigned int offset,
284  dataBuffer_type & dataBuff,
285  unsigned int ppos,
286  void * (& data_ptr)[sizeof...(prp)+1],
287  unsigned int n_pnt)
288  :dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
289  {};
290 
292  template<typename T>
293  __device__ __host__ inline void operator()(T& t)
294  {
295  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
296 
297  // Remove the reference from the type to copy
298  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
299 
300  meta_copy_block<pack_type,sizeof...(prp)+1,prp_cp::value,T::value>::copy(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
301  }
302 };
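In OpenFPM this functor is applied with a compile-time for-each over the selected properties. The hand-rolled sketch below (with hypothetical toy_aggregate_t and toy_data_buffer types, and assuming the usual Boost.MPL headers are reachable through the included utilities) invokes it directly for a single float property:

// Illustration only: minimal stand-ins for the aggregate and the block data buffer.
struct toy_aggregate_t
{
    typedef boost::mpl::vector<float> type;   // one property: a float
};

struct toy_data_buffer
{
    float blocks[4][64];                      // [block][offset]

    template<unsigned int p>
    float (& get(unsigned int blockPos))[64] { return blocks[blockPos]; }
};

inline void pack_one_point_example()
{
    toy_data_buffer buff = {};
    buff.blocks[2][5] = 7.0f;

    float out[16] = {};
    void * data_ptr[2] = { out, nullptr };    // sizeof...(prp) + 1 slots

    // Pack point ppos=3 from block 2, offset 5, property 0.
    sparsegridgpu_pack_impl<toy_aggregate_t, toy_data_buffer, 0>
        pack(2, 5, buff, 3, data_ptr, 16);

    boost::mpl::int_<0> idx;
    pack(idx);                                // out[3] == 7.0f
}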
303 
304 
313 //! this class is a functor for "for_each" algorithm
314 template<typename AggregateT, typename dataBuffer_type, int ... prp>
315 struct sparsegridgpu_unpack_impl
316 {
317  typedef typename to_boost_vmpl<prp...>::type vprp;
318 
320  unsigned int dataBlockPos;
321 
323  unsigned int offset;
324 
326  dataBuffer_type & dataBuff;
327 
329  unsigned int ppos;
330 
332  arr_arr_ptr<1,sizeof...(prp)> & data_ptr;
333 
335  unsigned int n_pnt;
336 
340  __device__ __host__ inline sparsegridgpu_unpack_impl(unsigned int dataBlockPos,
341  unsigned int offset,
342  dataBuffer_type & dataBuff,
343  unsigned int ppos,
344  arr_arr_ptr<1,sizeof...(prp)> & data_ptr,
345  unsigned int n_pnt)
346  :dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
347  {};
348 
350  template<typename T>
351  __device__ __host__ inline void operator()(T& t)
352  {
353  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
354 
355  // Remove the reference from the type to copy
356  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
357 
358  meta_copy_block<pack_type,sizeof...(prp),prp_cp::value,T::value>::copy_inv(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
359  }
360 };
361 
370 //! this class is a functor for "for_each" algorithm
371 template<typename AggregateT, int ... prp>
372 struct sparsegridgpu_pack_request
373 {
374  typedef typename to_boost_vmpl<prp...>::type vprp;
375 
377  size_t point_size;
378 
382  inline sparsegridgpu_pack_request()
383  :point_size(0)
384  {};
385 
387  template<typename T>
388  inline void operator()(T& t)
389  {
390  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
391 
392  // Remove the reference from the type to copy
393  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
394 
395  point_size += sizeof(pack_type);
396  }
397 };
398 
399 template<unsigned int edgeSize, unsigned int dim>
400 inline __device__ unsigned int coordToLin(const unsigned int (&coord)[dim], const unsigned int paddingSize = 0)
401 {
402  unsigned int linId = coord[dim - 1];
403  for (int d = dim - 2; d >= 0; --d)
404  {
405  linId *= edgeSize + 2 * paddingSize;
406  linId += coord[d];
407  }
408  return linId;
409 }
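coordToLin linearizes a coordinate with index 0 (x) as the fastest-running dimension; with padding the stride per dimension becomes edgeSize + 2*paddingSize. A small sketch of the arithmetic:

// Sketch: edgeSize = 8, paddingSize = 1, so the enlarged edge is 10 and
// (x, y, z) maps to x + y*10 + z*100.
__device__ inline unsigned int coordToLin_example()
{
    const unsigned int c[3] = {2, 3, 4};
    return coordToLin<8, 3>(c, 1);   // 2 + 3*10 + 4*100 = 432
}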
410 
411 
412 template<unsigned int edgeSize, typename CoordT, unsigned int dim>
413 inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> &coord, const unsigned int paddingSize = 0)
414 {
415  unsigned int linId = coord.get(dim - 1);
416  for (int d = dim - 2; d >= 0; --d)
417  {
418  linId *= edgeSize + 2 * paddingSize;
419  linId += coord.get(d);
420  }
421  return linId;
422 }
423 
424 template <typename CoordT,unsigned int dim>
425 inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> & coord, grid_key_dx<dim, int> & blockDimensions)
426 {
427  unsigned int linId = coord.get(dim - 1);
428  for (int d = dim - 2; d >= 0; --d)
429  {
430  linId *= blockDimensions.get(d);
431  linId += coord.get(d);
432  }
433  return linId;
434 }
435 
436 
437 
438 template<unsigned int edgeSize, unsigned int dim>
439 inline __device__ void linToCoordWithOffset(const unsigned int linId, const unsigned int offset, unsigned int (&coord)[dim])
440 {
441  unsigned int linIdTmp = linId;
442  for (unsigned int d = 0; d < dim; ++d)
443  {
444  coord[d] = linIdTmp % edgeSize;
445  coord[d] += offset;
446  linIdTmp /= edgeSize;
447  }
448 }
449 
450 template<unsigned int edgeSize, unsigned int dim>
451 inline __device__ void linToCoord(const unsigned int linId, unsigned int (&coord)[dim])
452 {
453  unsigned int linIdTmp = linId;
454  for (unsigned int d = 0; d < dim; ++d)
455  {
456  coord[d] = linIdTmp % edgeSize;
457  linIdTmp /= edgeSize;
458  }
459 }
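linToCoordWithOffset and coordToLin are used together by loadGhostBlock_impl below to remap a thread index that enumerates the interior of a block onto the corresponding cell of the enlarged (ghost-padded) shared-memory block. A minimal sketch, assuming blockEdgeSize = 8 and stencilSupportRadius = 1:

__device__ inline unsigned int interior_to_enlarged(unsigned int tid)
{
    unsigned int coord[3];
    linToCoordWithOffset<8>(tid, 1, coord);  // block-local coordinate shifted by the radius
    return coordToLin<8>(coord, 1);          // linear id inside the 10x10x10 enlarged block
}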
460 
461 constexpr int gt = 0;
462 constexpr int nt = 1;
463 
464 template<unsigned int nLoop, unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
465 struct loadGhostBlock_impl
466 {
467  template<typename AggrWrapperT,
468  typename SharedPtrT,
469  typename vector_type,
470  typename vector_type2,
471  typename blockMapType,
472  typename AggrBck>
473  __device__ static inline void load(const AggrWrapperT &block,
474  SharedPtrT * sharedRegionPtr,
475  const vector_type & ghostLayerToThreadsMapping,
476  const vector_type2 & nn_blocks,
477  const blockMapType & blockMap,
478  unsigned int stencilSupportRadius,
479  unsigned int ghostLayerSize,
480  const unsigned int blockId,
481  AggrBck & bck)
482  {
483  printf("Error: loadGhostBlock_impl not implemented for nLoop=%u \n",nLoop);
484  }
485 };
486 
487 template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
488 struct loadGhostBlock_impl<1,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
489 {
490  template<typename AggrWrapperT,
491  typename SharedPtrT,
492  typename vector_type,
493  typename vector_type2,
494  typename blockMapType,
495  typename AggrBck>
496  __device__ static inline void load(const AggrWrapperT &block,
497  SharedPtrT * sharedRegionPtr,
498  const vector_type & ghostLayerToThreadsMapping,
499  const vector_type2 & nn_blocks,
500  const blockMapType & blockMap,
501  unsigned int stencilSupportRadius,
502  unsigned int ghostLayerSize,
503  const unsigned int blockIdPos,
504  AggrBck & bck)
505  {
506  typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
507 
508  const int pos = threadIdx.x % ghostLayerSize;
509 
510  __shared__ int neighboursPos[ct_params::nNN];
511 
512  const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
513  short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
514 
515  // Convert pos into a linear id accounting for the inner domain offsets
516  const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
517  // Now get linear offset wrt the first element of the block
518 
519  int ctr = linId;
520  unsigned int acc = 1;
521  unsigned int offset = 0;
522  for (int i = 0; i < dim; ++i)
523  {
524  int v = (ctr % edge) - stencilSupportRadius;
525  v = (v < 0)?(v + blockEdgeSize):v;
526  v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
527  offset += v*acc;
528  ctr /= edge;
529  acc *= blockEdgeSize;
530  }
531 
532  // Convert pos into a linear id accounting for the ghost offsets
533  unsigned int coord[dim];
534  linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
535  const unsigned int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
536 
537  unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
538 
539  if (threadIdx.x < ct_params::nNN)
540  {
541  neighboursPos[threadIdx.x] = nnb;
542  }
543 
544  __syncthreads();
545 
546  // Actually load the data into the shared region
547  auto nPos = neighboursPos[neighbourNum];
548 
549  auto gdata = blockMap.template get_ele<p>(nPos)[offset];
550 
551  // Actually load the data into the shared region
552  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
553  auto bdata = block.template get<p>()[threadIdx.x];
554 
555  auto bmask = block.template get<pMask>()[threadIdx.x];
556  auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
557 
558  if (bmask == 0) { set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
559  if (gmask == 0) { set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
560 
561  sharedRegionPtr[linId] = gdata;
562  sharedRegionPtr[linId2] = bdata;
563  }
564 };
565 
566 template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
567 struct loadGhostBlock_impl<2,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
568 {
569  template<typename AggrWrapperT,
570  typename SharedPtrT,
571  typename vector_type,
572  typename vector_type2,
573  typename blockMapType,
574  typename AggrBck>
575  __device__ static inline void load(const AggrWrapperT &block,
576  SharedPtrT * sharedRegionPtr,
577  const vector_type & ghostLayerToThreadsMapping,
578  const vector_type2 & nn_blocks,
579  const blockMapType & blockMap,
580  unsigned int stencilSupportRadius,
581  unsigned int ghostLayerSize,
582  const unsigned int blockIdPos,
583  AggrBck & bck)
584  {
585  typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
586 
587  const int pos = threadIdx.x % ghostLayerSize;
588  const int pos_d1 = (threadIdx.x + blockDim.x) % ghostLayerSize;
589 
590  __shared__ int neighboursPos[ct_params::nNN];
591 
592  const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
593  short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
594  short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
595 
596  // Convert pos into a linear id accounting for the inner domain offsets
597  const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
598  const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
599  // Now get linear offset wrt the first element of the block
600 
601  int ctr = linId;
602  int ctr2 = linId2;
603  unsigned int acc = 1;
604  unsigned int offset = 0;
605  unsigned int offset2 = 0;
606  for (int i = 0; i < dim; ++i)
607  {
608  int v = (ctr % edge) - stencilSupportRadius;
609  int v2 = (ctr2 % edge) - stencilSupportRadius;
610  v = (v < 0)?(v + blockEdgeSize):v;
611  v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
612  v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
613  v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
614  offset += v*acc;
615  offset2 += v2*acc;
616  ctr /= edge;
617  ctr2 /= edge;
618  acc *= blockEdgeSize;
619  }
620 
621  // Convert pos into a linear id accounting for the ghost offsets
622  unsigned int coord[dim];
623  linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
624  const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
625 // const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);
626 
627 // printf("AAA %d %d \n",linId_b,linId_b_test);
628 
629  unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
630 
631  if (threadIdx.x < ct_params::nNN)
632  {
633  neighboursPos[threadIdx.x] = nnb;
634  }
635 
636  __syncthreads();
637 
638  // Actually load the data into the shared region
639  auto nPos = neighboursPos[neighbourNum];
640  auto nPos2 = neighboursPos[neighbourNum2];
641 
642  auto gdata = blockMap.template get_ele<p>(nPos)[offset];
643  auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
644 
645  // Actually load the data into the shared region
646  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
647  auto bdata = block.template get<p>()[threadIdx.x];
648 
649  auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
650  auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
651 
652  // Actually load the data into the shared region
653  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
654  auto bmask = block.template get<pMask>()[threadIdx.x];
655 
656  if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
657  if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
658  if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
659 
660  sharedRegionPtr[linId] = gdata;
661  sharedRegionPtr[linId2] = gdata2;
662  sharedRegionPtr[linId_b] = bdata;
663  }
664 };
665 
666 template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
667 struct loadGhostBlock_impl<3,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
668 {
669  template<typename AggrWrapperT,
670  typename SharedPtrT,
671  typename vector_type,
672  typename vector_type2,
673  typename blockMapType,
674  typename AggrBck>
675  __device__ static inline void load(const AggrWrapperT &block,
676  SharedPtrT * sharedRegionPtr,
677  const vector_type & ghostLayerToThreadsMapping,
678  const vector_type2 & nn_blocks,
679  const blockMapType & blockMap,
680  unsigned int stencilSupportRadius,
681  unsigned int ghostLayerSize,
682  const unsigned int blockIdPos,
683  AggrBck & bck)
684  {
685  typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
686 
687  const int pos = threadIdx.x % ghostLayerSize;
688  const int pos_d1 = (threadIdx.x + 2*blockDim.x) % ghostLayerSize;
689 
690  __shared__ int neighboursPos[ct_params::nNN];
691 
692  const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
693  short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
694  short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
695  short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
696 
697  // Convert pos into a linear id accounting for the inner domain offsets
698  const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
699  const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
700  const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
701  // Now get linear offset wrt the first element of the block
702 
703  int ctr = linId;
704  int ctr2 = linId2;
705  int ctr3 = linId3;
706  unsigned int acc = 1;
707  unsigned int offset = 0;
708  unsigned int offset2 = 0;
709  unsigned int offset3 = 0;
710  for (int i = 0; i < dim; ++i)
711  {
712  int v = (ctr % edge) - stencilSupportRadius;
713  int v2 = (ctr2 % edge) - stencilSupportRadius;
714  int v3 = (ctr3 % edge) - stencilSupportRadius;
715  v = (v < 0)?(v + blockEdgeSize):v;
716  v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
717  v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
718  v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
719  v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
720  v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
721  offset += v*acc;
722  offset2 += v2*acc;
723  offset3 += v3*acc;
724  ctr /= edge;
725  ctr2 /= edge;
726  ctr3 /= edge;
727  acc *= blockEdgeSize;
728  }
729 
730  // Convert pos into a linear id accounting for the ghost offsets
731  unsigned int coord[dim];
732  linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
733  const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
734 // const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);
735 
736 // printf("AAA %d %d \n",linId_b,linId_b_test);
737 
738  unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
739 
740  if (threadIdx.x < ct_params::nNN)
741  {
742  neighboursPos[threadIdx.x] = nnb;
743  }
744 
745  __syncthreads();
746 
747  // Actually load the data into the shared region
748  auto nPos = neighboursPos[neighbourNum];
749  auto nPos2 = neighboursPos[neighbourNum2];
750  auto nPos3 = neighboursPos[neighbourNum3];
751 
752  auto gdata = blockMap.template get_ele<p>(nPos)[offset];
753  auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
754  auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];
755 
756  // Actually load the data into the shared region
757  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
758  auto bdata = block.template get<p>()[threadIdx.x];
759 
760  auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
761  auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
762  auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];
763 
764  // Actually load the data into the shared region
765  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
766  auto bmask = block.template get<pMask>()[threadIdx.x];
767 
768  if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
769  if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
770  if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
771  if (gmask3 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}
772 
773  sharedRegionPtr[linId] = gdata;
774  sharedRegionPtr[linId2] = gdata2;
775  sharedRegionPtr[linId3] = gdata3;
776  sharedRegionPtr[linId_b] = bdata;
777  }
778 };
779 
780 template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
781 struct loadGhostBlock_impl<7,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
782 {
783  template<typename AggrWrapperT,
784  typename SharedPtrT,
785  typename vector_type,
786  typename vector_type2,
787  typename blockMapType,
788  typename AggrBck>
789  __device__ static inline void load(const AggrWrapperT &block,
790  SharedPtrT * sharedRegionPtr,
791  const vector_type & ghostLayerToThreadsMapping,
792  const vector_type2 & nn_blocks,
793  const blockMapType & blockMap,
794  unsigned int stencilSupportRadius,
795  unsigned int ghostLayerSize,
796  const unsigned int blockIdPos,
797  AggrBck & bck)
798  {
799  typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
800 
801  const int pos = threadIdx.x % ghostLayerSize;
802  const int pos_d1 = (threadIdx.x + 6*blockDim.x) % ghostLayerSize;
803 
804  __shared__ int neighboursPos[ct_params::nNN];
805 
806  const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
807  short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
808  short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
809  short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos + 2*blockDim.x);
810  short int neighbourNum4 = ghostLayerToThreadsMapping.template get<nt>(pos + 3*blockDim.x);
811  short int neighbourNum5 = ghostLayerToThreadsMapping.template get<nt>(pos + 4*blockDim.x);
812  short int neighbourNum6 = ghostLayerToThreadsMapping.template get<nt>(pos + 5*blockDim.x);
813  short int neighbourNum7 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
814 
815  // Convert pos into a linear id accounting for the inner domain offsets
816  const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
817  const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
818  const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos + 2*blockDim.x);
819  const unsigned int linId4 = ghostLayerToThreadsMapping.template get<gt>(pos + 3*blockDim.x);
820  const unsigned int linId5 = ghostLayerToThreadsMapping.template get<gt>(pos + 4*blockDim.x);
821  const unsigned int linId6 = ghostLayerToThreadsMapping.template get<gt>(pos + 5*blockDim.x);
822  const unsigned int linId7 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
823  // Now get linear offset wrt the first element of the block
824 
825  int ctr = linId;
826  int ctr2 = linId2;
827  int ctr3 = linId3;
828  int ctr4 = linId4;
829  int ctr5 = linId5;
830  int ctr6 = linId6;
831  int ctr7 = linId7;
832  unsigned int acc = 1;
833  unsigned int offset = 0;
834  unsigned int offset2 = 0;
835  unsigned int offset3 = 0;
836  unsigned int offset4 = 0;
837  unsigned int offset5 = 0;
838  unsigned int offset6 = 0;
839  unsigned int offset7 = 0;
840  for (int i = 0; i < dim; ++i)
841  {
842  int v = (ctr % edge) - stencilSupportRadius;
843  int v2 = (ctr2 % edge) - stencilSupportRadius;
844  int v3 = (ctr3 % edge) - stencilSupportRadius;
845  int v4 = (ctr4 % edge) - stencilSupportRadius;
846  int v5 = (ctr5 % edge) - stencilSupportRadius;
847  int v6 = (ctr6 % edge) - stencilSupportRadius;
848  int v7 = (ctr7 % edge) - stencilSupportRadius;
849  v = (v < 0)?(v + blockEdgeSize):v;
850  v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
851  v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
852  v4 = (v4 < 0)?(v4 + blockEdgeSize):v4;
853  v5 = (v5 < 0)?(v5 + blockEdgeSize):v5;
854  v6 = (v6 < 0)?(v6 + blockEdgeSize):v6;
855  v7 = (v7 < 0)?(v7 + blockEdgeSize):v7;
856  v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
857  v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
858  v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
859  v4 = (v4 >= blockEdgeSize)?v4-blockEdgeSize:v4;
860  v5 = (v5 >= blockEdgeSize)?v5-blockEdgeSize:v5;
861  v6 = (v6 >= blockEdgeSize)?v6-blockEdgeSize:v6;
862  v7 = (v7 >= blockEdgeSize)?v7-blockEdgeSize:v7;
863  offset += v*acc;
864  offset2 += v2*acc;
865  offset3 += v3*acc;
866  offset4 += v4*acc;
867  offset5 += v5*acc;
868  offset6 += v6*acc;
869  offset7 += v7*acc;
870  ctr /= edge;
871  ctr2 /= edge;
872  ctr3 /= edge;
873  ctr4 /= edge;
874  ctr5 /= edge;
875  ctr6 /= edge;
876  ctr7 /= edge;
877  acc *= blockEdgeSize;
878  }
879 
880  // Convert pos into a linear id accounting for the ghost offsets
881  unsigned int coord[dim];
882  linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
883  const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
884 // const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);
885 
886 // printf("AAA %d %d \n",linId_b,linId_b_test);
887 
888  unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
889 
890  if (threadIdx.x < ct_params::nNN)
891  {
892  neighboursPos[threadIdx.x] = nnb;
893  }
894 
895  __syncthreads();
896 
897  // Actually load the data into the shared region
898  auto nPos = neighboursPos[neighbourNum];
899  auto nPos2 = neighboursPos[neighbourNum2];
900  auto nPos3 = neighboursPos[neighbourNum3];
901  auto nPos4 = neighboursPos[neighbourNum4];
902  auto nPos5 = neighboursPos[neighbourNum5];
903  auto nPos6 = neighboursPos[neighbourNum6];
904  auto nPos7 = neighboursPos[neighbourNum7];
905 
906  auto gdata = blockMap.template get_ele<p>(nPos)[offset];
907  auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
908  auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];
909  auto gdata4 = blockMap.template get_ele<p>(nPos4)[offset4];
910  auto gdata5 = blockMap.template get_ele<p>(nPos5)[offset5];
911  auto gdata6 = blockMap.template get_ele<p>(nPos6)[offset6];
912  auto gdata7 = blockMap.template get_ele<p>(nPos7)[offset7];
913 
914  // Actually load the data into the shared region
915  //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
916  auto bdata = block.template get<p>()[threadIdx.x];
917 
918  auto bmask = block.template get<pMask>()[threadIdx.x];
919  auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
920  auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
921  auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];
922  auto gmask4 = blockMap.template get_ele<pMask>(nPos4)[offset4];
923  auto gmask5 = blockMap.template get_ele<pMask>(nPos5)[offset5];
924  auto gmask6 = blockMap.template get_ele<pMask>(nPos6)[offset6];
925  auto gmask7 = blockMap.template get_ele<pMask>(nPos7)[offset7];
926 
927  if (bmask == 0) {set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
928  if (gmask == 0) {set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
929  if (gmask2 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
930  if (gmask3 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}
931  if (gmask4 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata4,bck);}
932  if (gmask5 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata5,bck);}
933  if (gmask6 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata6,bck);}
934  if (gmask7 == 0) {set_compile_condition<pMask != p>::template set<p>(gdata7,bck);}
935 
936  sharedRegionPtr[linId] = gdata;
937  sharedRegionPtr[linId2] = gdata2;
938  sharedRegionPtr[linId3] = gdata3;
939  sharedRegionPtr[linId4] = gdata4;
940  sharedRegionPtr[linId5] = gdata5;
941  sharedRegionPtr[linId6] = gdata6;
942  sharedRegionPtr[linId7] = gdata7;
943  sharedRegionPtr[linId_b] = bdata;
944  }
945 };
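Each specialization above makes every thread load nLoop ghost entries (at pos, pos + blockDim.x, ..., pos + (nLoop-1)*blockDim.x), so nLoop is in effect ceil(ghostLayerSize / blockDim.x). A hedged sketch of that relation, assuming one thread per interior cell and ghostLayerSize = (blockEdgeSize + 2*r)^dim - blockEdgeSize^dim:

// Assumed relation, for one thread per interior cell:
//   dim=3, blockEdgeSize=8, r=1 : ghost = 1000 - 512 = 488, threads = 512 -> nLoop = 1
//   dim=3, blockEdgeSize=4, r=1 : ghost =  216 -  64 = 152, threads =  64 -> nLoop = 3
//   dim=3, blockEdgeSize=2, r=1 : ghost =   64 -   8 =  56, threads =   8 -> nLoop = 7
constexpr unsigned int nLoop_for(unsigned int ghostLayerSize, unsigned int nThreads)
{
    return (ghostLayerSize + nThreads - 1) / nThreads;
}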
946 
947 #endif /* SPARSEGRIDGPU_KER_UTIL_HPP_ */