OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
SparseGridGpu_util_test.cuh
1 /*
2  * SparseGridGpu_util_test.cuh
3  *
4  * Created on: Sep 9, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef SPARSEGRIDGPU_UTIL_TEST_CUH_
9 #define SPARSEGRIDGPU_UTIL_TEST_CUH_
10 
11 #include "SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh"
12 
13 template<unsigned int p, typename SparseGridType>
14 __global__ void getValues2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
15 {
16  sparseGrid.init();
17 
18  const auto bDimX = blockDim.x;
19  const auto bDimY = blockDim.y;
20  const auto bIdX = blockIdx.x;
21  const auto bIdY = blockIdx.y;
22  const auto tIdX = threadIdx.x;
23  const auto tIdY = threadIdx.y;
24  int x = bIdX * bDimX + tIdX + offsetX;
25  int y = bIdY * bDimY + tIdY + offsetY;
27 
28  auto value = sparseGrid.template get<p>(coord);
29  value++;
30 
31  // Compiler avoid warning
32  x++;
33  y++;
34 }
35 
36 template<unsigned int p, typename SparseGridType>
37 __global__ void getValuesNeighbourhood2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
38 {
39  sparseGrid.init();
40 
41  const auto bDimX = blockDim.x;
42  const auto bDimY = blockDim.y;
43  const auto bIdX = blockIdx.x;
44  const auto bIdY = blockIdx.y;
45  const auto tIdX = threadIdx.x;
46  const auto tIdY = threadIdx.y;
47  int x = bIdX * bDimX + tIdX + offsetX;
48  int y = bIdY * bDimY + tIdY + offsetY;
49  --x; --y;
51  for (int i=0; i < 9; ++i)
52  {
53  auto value = sparseGrid.template get<p>(coord);
54  coord.set_d(0, x + i%3);
55  coord.set_d(1, y + i/3);
56  value++;
57  }
58 }
59 
60 template<unsigned int p, typename SparseGridType>
61 __global__ void insertValues2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
62 {
63  sparseGrid.init();
64 
65  const auto bDimX = blockDim.x;
66  const auto bDimY = blockDim.y;
67  const auto bIdX = blockIdx.x;
68  const auto bIdY = blockIdx.y;
69  const auto tIdX = threadIdx.x;
70  const auto tIdY = threadIdx.y;
71  int x = bIdX * bDimX + tIdX + offsetX;
72  int y = bIdY * bDimY + tIdY + offsetY;
74 
75  sparseGrid.template insert<p>(coord) = x*x*y*y; // some function...
76 
77  __syncthreads();
78 
79  sparseGrid.flush_block_insert();
80 
81  // Compiler avoid warning
82  x++;
83  y++;
84 }
85 
86 template<unsigned int p, unsigned int chunksPerBlock, unsigned int blockEdgeSize, typename SparseGridType>
87 __global__ void insertValues2DBlocked(SparseGridType sparseGrid, const int sOffsetX=0, const int sOffsetY=0)
88 {
89  constexpr unsigned int pMask = SparseGridType::pMask;
90  typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
91  typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
92 
93  sparseGrid.init();
94 
95  int posX = blockIdx.x * blockDim.x + threadIdx.x + sOffsetX;
96  int posY = blockIdx.y * blockDim.y + threadIdx.y + sOffsetY;
97  const unsigned int offsetX = posX % blockEdgeSize;
98  const unsigned int offsetY = posY % blockEdgeSize;
99 
100  const unsigned int offset = offsetY * blockEdgeSize + offsetX;
101 
102  grid_key_dx<SparseGridType::d, int> blockCoord({posX / blockEdgeSize, posY / blockEdgeSize});
103  auto encap = sparseGrid.insertBlock(sparseGrid.getBlockLinId(blockCoord));
104 
105  encap.template get<p>()[offset] = posX*posX * posY*posY;
106  BlockMapGpu_ker<>::setExist(encap.template get<pMask>()[offset]);
107 
108  __syncthreads();
109 
110  sparseGrid.flush_block_insert();
111 }
112 
113 
114 
115 template<unsigned int p, unsigned int chunksPerBlock=1, typename SparseGridType, typename ScalarT>
116 __global__ void insertConstantValue(SparseGridType sparseGrid, ScalarT value)
117 {
118  constexpr unsigned int pMask = SparseGridType::pMask;
119  typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
120  typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
121 
122  sparseGrid.init();
123 
124  int x = blockIdx.x * blockDim.x + threadIdx.x;
125  int y = blockIdx.y * blockDim.y + threadIdx.y;
126  int z = blockIdx.z * blockDim.z + threadIdx.z;
128 
129  auto pos = sparseGrid.getLinId(coord);
130  unsigned int dataBlockId = pos / BlockT::size;
131  unsigned int offset = pos % BlockT::size;
132 
133  auto encap = sparseGrid.template insertBlock<chunksPerBlock>(dataBlockId,BlockT::size);
134 
135  encap.template get<p>()[offset] = value;
136  BlockMapGpu_ker<>::setExist(encap.template get<pMask>()[offset]);
137 
138  __syncthreads();
139 
140  sparseGrid.flush_block_insert();
141 
142  // Compiler avoid warning
143  x++;
144  y++;
145  z++;
146 }
147 
148 
149 template<unsigned int p, typename SparseGridType, typename ValueT>
150 __global__ void insertOneValue(SparseGridType sparseGrid, dim3 pt, ValueT value)
151 {
152  sparseGrid.init();
153 
154  int x = blockIdx.x * blockDim.x + threadIdx.x;
155  int y = blockIdx.y * blockDim.y + threadIdx.y;
156  int z = blockIdx.z * blockDim.z + threadIdx.z;
157  dim3 thCoord(x, y, z);
158  if (thCoord.x == pt.x && thCoord.y == pt.y && thCoord.z == pt.z)
159  {
161  sparseGrid.template insert<p>(coord) = value;
162  }
163  __syncthreads();
164 
165  sparseGrid.flush_block_insert();
166 
167  // Compiler avoid warning
168  y++;
169  z++;
170 }
171 
172 template<unsigned int p, typename SparseGridType, typename VectorOutType>
173 __global__ void copyBlocksToOutput(SparseGridType sparseGrid, VectorOutType output)
174 {
175  const auto bDimX = blockDim.x;
176  const auto bDimY = blockDim.y;
177  const auto bDimZ = blockDim.z;
178  const auto bIdX = blockIdx.x;
179  const auto bIdY = blockIdx.y;
180  const auto bIdZ = blockIdx.z;
181  const auto tIdX = threadIdx.x;
182  const auto tIdY = threadIdx.y;
183  const auto tIdZ = threadIdx.z;
184  int x = bIdX * bDimX + tIdX;
185  int y = bIdY * bDimY + tIdY;
186  int z = bIdZ * bDimZ + tIdZ;
188 
189  size_t pos = sparseGrid.getLinId(coord);
190 
191  auto value = sparseGrid.template get<p>(coord);
192 
193  output.template get<p>(pos) = value;
194 
195  // Compiler avoid warning
196  x++;
197  y++;
198  z++;
199 }
200 
201 template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
203 {
204  typedef NNStar<dim> stencil_type;
205 
206  // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
207 
208  static constexpr unsigned int flops = 3 + 2*dim;
209 
210  static constexpr unsigned int supportRadius = 1;
211 
224  template<typename SparseGridT, typename DataBlockWrapperT>
225  static inline __device__ void stencil(
226  SparseGridT & sparseGrid,
227  const unsigned int dataBlockId,
228  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
229  const unsigned int offset,
230  const grid_key_dx<dim, int> & pointCoord,
231  const DataBlockWrapperT & dataBlockLoad,
232  DataBlockWrapperT & dataBlockStore,
233  unsigned char curMask,
234  float dt)
235  {
236  typedef typename SparseGridT::AggregateBlockType AggregateT;
237  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
238 
239  constexpr unsigned int enlargedBlockSize = IntPow<
240  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
241 
242  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
243 
244  sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
245 
246  __syncthreads();
247 
248  decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
249  ScalarT res = 0;
250 
251  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
252  {
253  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
254  // const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
255  linId = sparseGrid.getLinIdInEnlargedBlock(offset);
256  ScalarT cur = enlargedBlock[linId];
257  ScalarT laplacian = -2.0 * dim * cur; // The central part of the stencil
258 
259  for (int d = 0; d < dim; ++d)
260  {
261  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
262  auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
263  ScalarT neighbourPlus = enlargedBlock[nPlusId];
264  ScalarT neighbourMinus = enlargedBlock[nMinusId];
265  laplacian += neighbourMinus + neighbourPlus;
266  }
267  // enlargedBlock[linId] = cur + dt * laplacian;
268  res = cur + dt * laplacian;
269  }
270 
271  __syncthreads();
272  if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
273  {
274  enlargedBlock[linId] = res;
275  }
276  __syncthreads();
277  sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
278  }
279 
292  template<typename SparseGridT, typename DataBlockWrapperT>
293  static inline __host__ void stencilHost(
294  SparseGridT & sparseGrid,
295  const unsigned int dataBlockId,
296  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
297  const unsigned int offset,
298  const grid_key_dx<dim, int> & pointCoord,
299  const DataBlockWrapperT & dataBlockLoad,
300  DataBlockWrapperT & dataBlockStore,
301  bool isActive,
302  float dt)
303  {
304  constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
305 
306  if (isActive)
307  {
308  auto cur = dataBlockLoad.template get<p_src>()[offset];
309  auto laplacian = -2.0 * dim * cur; // The central part of the stencil
310 
311  auto neighbourCoord = pointCoord;
312  auto counter = offset;
313  unsigned int dimStride = 1;
314  for (int d = 0; d < dim; ++d)
315  {
316  const auto localOffset = counter % blockEdgeSize;
317 
318  if (localOffset == 0) // This means we are at the lower boundary for this dimension
319  {
320  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
321  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
322  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
323  }
324  else
325  {
326  laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
327  }
328  if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
329  {
330  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
331  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
332  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
333  }
334  else
335  {
336  laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
337  }
338  //
339  counter /= blockEdgeSize;
340  dimStride *= blockEdgeSize;
341  }
342  dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
343  }
344  }
345 
346  template <typename SparseGridT, typename CtxT>
347  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
348  {
349  sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
350  }
351 };
352 
353 struct conv_coeff
354 {
355  float coeff[3][3][3];
356 };
357 
358 
359 
360 template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
361 struct Conv3x3x3
362 {
363  // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
364 
365  typedef NNFull<dim> stencil_type;
366 
367  static constexpr unsigned int supportRadius = 1;
368 
369  static constexpr unsigned int flops = 2*27;
370 
371  template<typename SparseGridT, typename DataBlockWrapperT>
372  static inline __device__ void stencil(
373  SparseGridT & sparseGrid,
374  const unsigned int dataBlockId,
376  unsigned int offset,
377  grid_key_dx<dim, int> & pointCoord,
378  DataBlockWrapperT & dataBlockLoad,
379  DataBlockWrapperT & dataBlockStore,
380  bool applyStencilHere,
381  conv_coeff & cc)
382  {
383  typedef typename SparseGridT::AggregateBlockType AggregateT;
384  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
385 
386  constexpr unsigned int enlargedBlockSize = IntPow<
387  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
388 
389  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
390 
391  sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos,enlargedBlock);
392 
393  __syncthreads();
394 
395  if (applyStencilHere)
396  {
397  const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
398  const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
399  ScalarT tot = 0.0;
400  for (int i = 0; i < dim; ++i)
401  {
402  for (int j = 0; j < dim; ++j)
403  {
404  for (int k = 0; k < dim; ++k)
405  {
407 
408  key.set_d(0,i-1);
409  key.set_d(1,j-1);
410  key.set_d(2,k-1);
411 
412  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
413  tot += enlargedBlock[nPlusId] * cc.coeff[i][j][k];
414  }
415  }
416  }
417 
418  dataBlockStore.template get<p_dst>()[offset] = tot;
419  }
420  }
421 
422  template <typename SparseGridT, typename CtxT>
423  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
424  {
425  sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
426  }
427 };
428 
429 template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
430 struct Conv3x3x3_noshared
431 {
432  // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
433 
434  typedef NNFull<dim> stencil_type;
435 
436  static constexpr unsigned int supportRadius = 1;
437 
438  static constexpr unsigned int flops = 2*27;
439 
440  template<typename SparseGridT, typename DataBlockWrapperT>
441  static inline __device__ void stencil(
442  SparseGridT & sparseGrid,
443  const unsigned int dataBlockId,
445  unsigned int offset,
446  grid_key_dx<dim, int> & pointCoord,
447  DataBlockWrapperT & dataBlockLoad,
448  DataBlockWrapperT & dataBlockStore,
449  bool applyStencilHere,
450  conv_coeff & cc)
451  {
452  typedef typename SparseGridT::AggregateBlockType AggregateT;
453  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
454 
455  if (applyStencilHere)
456  {
457  ScalarT tot = 0.0;
458  for (int i = 0; i < dim; ++i)
459  {
460  for (int j = 0; j < dim; ++j)
461  {
462  for (int k = 0; k < dim; ++k)
463  {
464 
466 
467  key.set_d(0,k-1);
468  key.set_d(1,j-1);
469  key.set_d(2,i-1);
470 
471  auto pos = sparseGrid.template getNNPoint<stencil_type>(dataBlockIdPos, offset, key);
472 
473  tot += sparseGrid.template get<p_src>(pos) * cc.coeff[i][j][k];
474  }
475  }
476  }
477 
478  dataBlockStore.template get<p_dst>()[offset] = tot;
479  }
480  }
481 
482  template <typename SparseGridT, typename CtxT>
483  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
484  {
485  sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
486  }
487 };
488 
489 template<typename SparseGridZ>
490 void testConv3x3x3_perf(std::string testName)
491 {
492  constexpr unsigned int dim = 3;
493  constexpr unsigned int blockEdgeSize = 8;
494  typedef aggregate<float,float> AggregateT;
495 
496  unsigned int iterations = 100;
497 
498  size_t sz[] = {1000,1000,1000};
499 
500  SparseGridZ sparseGrid(sz);
501  mgpu::ofp_context_t ctx;
502  sparseGrid.template setBackgroundValue<0>(0);
503 
504  // now create 3 3D sphere
505 
506  grid_key_dx<3,int> start({256,256,256});
507 
508  dim3 gridSize(32,32,32);
509 
510  // Insert values on the grid
511  sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
512  CUDA_LAUNCH_DIM3((insertSphere3D_radius<0>),
513  gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
514  sparseGrid.toKernel(), start,128, 56, 1);
515 
516  sparseGrid.template flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
517 
518  sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
519  auto existingElements = sparseGrid.countExistingElements();
520  auto boundaryElements = sparseGrid.countBoundaryElements();
521  unsigned long long numElements = existingElements - boundaryElements;
522 
523  sparseGrid.template findNeighbours<NNFull<3>>(); // Pre-compute the neighbours pos for each block!
524 
525  sparseGrid.template setNNType<NNFull<dim>>();
526  sparseGrid.template tagBoundaries<NNFull<3>>(ctx);
527 
528  openfpm::vector<double> measures_gf;
529  openfpm::vector<double> measures_tm;
530 
531  for (unsigned int iter=0; iter<iterations; ++iter)
532  {
533  cudaDeviceSynchronize();
534  conv_coeff cc;
535 
536  for (int i = 0 ; i < 3 ; i++)
537  {
538  for (int j = 0 ; j < 3 ; j++)
539  {
540  for (int k = 0 ; k < 3 ; k++)
541  {
542  cc.coeff[k][j][i] = 1.0;
543  }
544  }
545  }
546 
547  timer ts;
548  ts.start();
549 
550  sparseGrid.template applyStencils<Conv3x3x3<dim,0,1>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,cc);
551 
552  cudaDeviceSynchronize();
553  ts.stop();
554 
555  measures_tm.add(ts.getwct());
556 
557  float gElemS = numElements / (1e9 * ts.getwct());
558  float gFlopsS = gElemS * Conv3x3x3<dim,0,1>::flops;
559 
560  measures_gf.add(gFlopsS);
561  }
562 
563  double mean_tm = 0;
564  double deviation_tm = 0;
565  standard_deviation(measures_tm,mean_tm,deviation_tm);
566 
567  double mean_gf = 0;
568  double deviation_gf = 0;
569  standard_deviation(measures_gf,mean_gf,deviation_gf);
570 
571  std::cout << "Test: " << testName << std::endl;
572  std::cout << "Block: " << SparseGridZ::blockEdgeSize_
573  << "x" << SparseGridZ::blockEdgeSize_
574  << "x" << SparseGridZ::blockEdgeSize_
575  << std::endl;
576  std::cout << "Grid: " << sz[0] << "x" << sz[1] << "x" << sz[2] << std::endl;
577 
578  double dataOccupancyMean, dataOccupancyDev;
579  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
580  std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
581 
582  std::cout << "Iterations: " << iterations << std::endl;
583  std::cout << "\tConvolution3x3x3: " << mean_tm << " dev:" << deviation_tm << " s" << std::endl;
584  std::cout << "Throughput: " << std::endl << "\t " << mean_gf << " GFlops/s dev: " << deviation_gf << " GFlops/s " << std::endl;
585 }
586 
587 template<typename SparseGridZ>
588 static void testConv3x3x3_no_shared_perf(std::string testName)
589 {
590  unsigned int iterations = 100;
591 
592  size_t sz[] = {1000,1000,1000};
593 
594  SparseGridZ sparseGrid(sz);
595  mgpu::ofp_context_t ctx;
596  sparseGrid.template setBackgroundValue<0>(0);
597 
598  // now create 3 3D sphere
599 
600  grid_key_dx<3,int> start({256,256,256});
601 
602  dim3 gridSize(32,32,32);
603 
604  // Insert values on the grid
605  sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
606  CUDA_LAUNCH_DIM3((insertSphere3D_radius<0>),
607  gridSize, dim3(SparseGridZ::blockEdgeSize_*SparseGridZ::blockEdgeSize_*SparseGridZ::blockEdgeSize_,1,1),
608  sparseGrid.toKernel(), start,128, 56, 1);
609 
610  sparseGrid.template flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
611 
612  sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
613  auto existingElements = sparseGrid.countExistingElements();
614  auto boundaryElements = sparseGrid.countBoundaryElements();
615  unsigned long long numElements = existingElements - boundaryElements;
616 
617  sparseGrid.template findNeighbours<NNFull<3>>(); // Pre-compute the neighbours pos for each block!
618 
619  sparseGrid.template setNNType<NNFull<SparseGridZ::dims>>();
620  sparseGrid.template tagBoundaries<NNFull<3>>(ctx,No_check(),tag_boundaries::CALCULATE_EXISTING_POINTS);
621 
622 
623  openfpm::vector<double> measures_gf;
624  openfpm::vector<double> measures_tm;
625 
626  for (unsigned int iter=0; iter<iterations; ++iter)
627  {
628  cudaDeviceSynchronize();
629  conv_coeff cc;
630 
631  for (int i = 0 ; i < 3 ; i++)
632  {
633  for (int j = 0 ; j < 3 ; j++)
634  {
635  for (int k = 0 ; k < 3 ; k++)
636  {
637  cc.coeff[k][j][i] = 1.0;
638  }
639  }
640  }
641 
642  timer ts;
643  ts.start();
644 
645  sparseGrid.template applyStencils<Conv3x3x3_noshared<SparseGridZ::dims,0,1>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE_NO_SHARED,cc);
646 
647  cudaDeviceSynchronize();
648  ts.stop();
649 
650  measures_tm.add(ts.getwct());
651 
652  float gElemS = numElements / (1e9 * ts.getwct());
653  float gFlopsS = gElemS * Conv3x3x3_noshared<SparseGridZ::dims,0,1>::flops;
654 
655  measures_gf.add(gFlopsS);
656  }
657 
658  double mean_tm = 0;
659  double deviation_tm = 0;
660  standard_deviation(measures_tm,mean_tm,deviation_tm);
661 
662  double mean_gf = 0;
663  double deviation_gf = 0;
664  standard_deviation(measures_gf,mean_gf,deviation_gf);
665 
666  std::cout << "Test: " << testName << std::endl;
667  std::cout << "Block: " << SparseGridZ::blockEdgeSize_
668  << "x" << SparseGridZ::blockEdgeSize_
669  << "x" << SparseGridZ::blockEdgeSize_
670  << std::endl;
671  std::cout << "Grid: " << sz[0] << "x" << sz[1] << "x" << sz[2] << std::endl;
672 
673  double dataOccupancyMean, dataOccupancyDev;
674  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
675  std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
676 
677  std::cout << "Iterations: " << iterations << std::endl;
678  std::cout << "\tConvolution3x3x3: " << mean_tm << " dev:" << deviation_tm << " s" << std::endl;
679  std::cout << "Throughput: " << std::endl << "\t " << mean_gf << " GFlops/s dev: " << deviation_gf << " GFlops/s " << std::endl;
680 }
681 
682 template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
684 {
685  typedef NNStar<dim> stencil_type;
686 
687  // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
688 
689  static constexpr unsigned int flops = 3 + 2*dim;
690 
691  static constexpr unsigned int supportRadius = 1;
692 
705  template<typename SparseGridT, typename DataBlockWrapperT>
706  static inline __device__ void stencil(
707  SparseGridT & sparseGrid,
708  const unsigned int dataBlockId,
709  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
710  const unsigned int offset,
711  const grid_key_dx<dim, int> & pointCoord,
712  const DataBlockWrapperT & dataBlockLoad,
713  DataBlockWrapperT & dataBlockStore,
714  bool isActive,
715  float dt)
716  {
717  typedef typename SparseGridT::AggregateBlockType AggregateT;
718  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
719 
720  ScalarT res = 0;
721  auto coord = pointCoord;
722 
723  if (isActive)
724  {
725  // ScalarT cur = sparseGrid.template get<p_src>(pointCoord);
726  ScalarT cur = dataBlockLoad.template get<p_dst>()[offset];
727  ScalarT laplacian = -2.0 * dim * cur; // The central part of the stencil
728 
729  for (int d = 0; d < dim; ++d)
730  {
731  auto locC = coord.get(d);
732  coord.set_d(d, locC+1);
733  auto nPlus = sparseGrid.template get<p_src>(coord);
734  coord.set_d(d, locC-1);
735  auto nMinus = sparseGrid.template get<p_src>(coord);
736  laplacian += nMinus + nPlus;
737  coord.set_d(d, locC);
738  }
739  res = cur + dt * laplacian;
740  }
741  __syncthreads();
742  if (isActive)
743  {
744  dataBlockStore.template get<p_dst>()[offset] = res;
745  }
746  }
747 
760  template<typename SparseGridT, typename DataBlockWrapperT>
761  static inline __host__ void stencilHost(
762  SparseGridT & sparseGrid,
763  const unsigned int dataBlockId,
764  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
765  const unsigned int offset,
766  const grid_key_dx<dim, int> & pointCoord,
767  const DataBlockWrapperT & dataBlockLoad,
768  DataBlockWrapperT & dataBlockStore,
769  bool isActive,
770  float dt)
771  {
772  constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
773 
774  if (isActive)
775  {
776  auto cur = dataBlockLoad.template get<p_src>()[offset];
777  auto laplacian = -2.0 * dim * cur; // The central part of the stencil
778 
779  auto neighbourCoord = pointCoord;
780  auto counter = offset;
781  unsigned int dimStride = 1;
782  for (int d = 0; d < dim; ++d)
783  {
784  const auto localOffset = counter % blockEdgeSize;
785 
786  if (localOffset == 0) // This means we are at the lower boundary for this dimension
787  {
788  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
789  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
790  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
791  }
792  else
793  {
794  laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
795  }
796  if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
797  {
798  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
799  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
800  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
801  }
802  else
803  {
804  laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
805  }
806  //
807  counter /= blockEdgeSize;
808  dimStride *= blockEdgeSize;
809  }
810  dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
811  }
812  }
813 
814  template <typename SparseGridT, typename CtxT>
815  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
816  {
817  sparseGrid.template flush <sRight_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
818  }
819 };
820 
821 template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
823 {
824  typedef NNStar<dim> stencil_type;
825 
826  // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
827 
828  static constexpr unsigned int flops = 1;
829 
830  static constexpr unsigned int supportRadius = 1;
831 
844  template<typename SparseGridT, typename DataBlockWrapperT>
845  static inline __device__ void stencil(
846  SparseGridT & sparseGrid,
847  const unsigned int dataBlockId,
848  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
849  const unsigned int offset,
850  const grid_key_dx<dim, int> & pointCoord,
851  const DataBlockWrapperT & dataBlockLoad,
852  DataBlockWrapperT & dataBlockStore,
853  bool isActive,
854  float dt)
855  {
856  typedef typename SparseGridT::AggregateBlockType AggregateT;
857  typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
858 
859  constexpr unsigned int enlargedBlockSize = IntPow<
860  SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
861 
862  __shared__ ScalarT enlargedBlock[enlargedBlockSize];
863  sparseGrid.loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
864 
865  __syncthreads();
866 
867  decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
868  ScalarT res = 0;
869 
870  if (isActive)
871  {
872  linId = sparseGrid.getLinIdInEnlargedBlock(offset);
873  ScalarT cur = enlargedBlock[linId];
874 
875  res = cur + cur;
876  }
877  __syncthreads();
878  if (isActive)
879  {
880  enlargedBlock[linId] = res;
881  }
882  __syncthreads();
883  sparseGrid.storeBlock<p_dst>(dataBlockStore, enlargedBlock);
884  }
885 
898  template<typename SparseGridT, typename DataBlockWrapperT>
899  static inline __host__ void stencilHost(
900  SparseGridT & sparseGrid,
901  const unsigned int dataBlockId,
902  const openfpm::sparse_index<unsigned int> dataBlockIdPos,
903  const unsigned int offset,
904  const grid_key_dx<dim, int> & pointCoord,
905  const DataBlockWrapperT & dataBlockLoad,
906  DataBlockWrapperT & dataBlockStore,
907  bool isActive,
908  float dt)
909  {
910  constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
911 
912  if (isActive)
913  {
914  auto cur = dataBlockLoad.template get<p_src>()[offset];
915  auto laplacian = -2.0 * dim * cur; // The central part of the stencil
916 
917  auto neighbourCoord = pointCoord;
918  auto counter = offset;
919  unsigned int dimStride = 1;
920  for (int d = 0; d < dim; ++d)
921  {
922  const auto localOffset = counter % blockEdgeSize;
923 
924  if (localOffset == 0) // This means we are at the lower boundary for this dimension
925  {
926  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
927  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
928  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
929  }
930  else
931  {
932  laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
933  }
934  if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
935  {
936  neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
937  laplacian += sparseGrid.template get<p_src>(neighbourCoord);
938  neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
939  }
940  else
941  {
942  laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
943  }
944  //
945  counter /= blockEdgeSize;
946  dimStride *= blockEdgeSize;
947  }
948  dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
949  }
950  }
951 
952  template <typename SparseGridT, typename CtxT>
953  static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
954  {
955  sparseGrid.template flush <sRight_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
956  }
957 };
958 
959 
960 
961 
962 #endif /* SPARSEGRIDGPU_UTIL_TEST_CUH_ */
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil function.
grid_key_dx is the key to access any element in the grid
Definition: grid_key.hpp:18
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, unsigned char curMask, float dt)
Stencil function.
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil function.
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
Definition: grid_key.hpp:516
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119