OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
SparseGridGpu_util_test.cuh
1/*
2 * SparseGridGpu_util_test.cuh
3 *
4 * Created on: Sep 9, 2019
5 * Author: i-bird
6 */
7
8#ifndef SPARSEGRIDGPU_UTIL_TEST_CUH_
9#define SPARSEGRIDGPU_UTIL_TEST_CUH_
10
11#include "SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh"
12
13template<unsigned int p, typename SparseGridType>
14__global__ void getValues2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
15{
16 sparseGrid.init();
17
18 const auto bDimX = blockDim.x;
19 const auto bDimY = blockDim.y;
20 const auto bIdX = blockIdx.x;
21 const auto bIdY = blockIdx.y;
22 const auto tIdX = threadIdx.x;
23 const auto tIdY = threadIdx.y;
24 int x = bIdX * bDimX + tIdX + offsetX;
25 int y = bIdY * bDimY + tIdY + offsetY;
27
28 auto value = sparseGrid.template get<p>(coord);
29 value++;
30
31 // Compiler avoid warning
32 x++;
33 y++;
34}
35
36template<unsigned int p, typename SparseGridType>
37__global__ void getValuesNeighbourhood2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
38{
39 sparseGrid.init();
40
41 const auto bDimX = blockDim.x;
42 const auto bDimY = blockDim.y;
43 const auto bIdX = blockIdx.x;
44 const auto bIdY = blockIdx.y;
45 const auto tIdX = threadIdx.x;
46 const auto tIdY = threadIdx.y;
47 int x = bIdX * bDimX + tIdX + offsetX;
48 int y = bIdY * bDimY + tIdY + offsetY;
49 --x; --y;
51 for (int i=0; i < 9; ++i)
52 {
53 auto value = sparseGrid.template get<p>(coord);
54 coord.set_d(0, x + i%3);
55 coord.set_d(1, y + i/3);
56 value++;
57 }
58}
59
60template<unsigned int p, typename SparseGridType>
61__global__ void insertValues2D(SparseGridType sparseGrid, const int offsetX=0, const int offsetY=0)
62{
63 sparseGrid.init();
64
65 const auto bDimX = blockDim.x;
66 const auto bDimY = blockDim.y;
67 const auto bIdX = blockIdx.x;
68 const auto bIdY = blockIdx.y;
69 const auto tIdX = threadIdx.x;
70 const auto tIdY = threadIdx.y;
71 int x = bIdX * bDimX + tIdX + offsetX;
72 int y = bIdY * bDimY + tIdY + offsetY;
74
75 sparseGrid.template insert<p>(coord) = x*x*y*y; // some function...
76
77 __syncthreads();
78
79 sparseGrid.flush_block_insert();
80
81 // Compiler avoid warning
82 x++;
83 y++;
84}
85
86template<unsigned int p, unsigned int chunksPerBlock, unsigned int blockEdgeSize, typename SparseGridType>
87__global__ void insertValues2DBlocked(SparseGridType sparseGrid, const int sOffsetX=0, const int sOffsetY=0)
88{
89 constexpr unsigned int pMask = SparseGridType::pMask;
90 typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
91 typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
92
93 sparseGrid.init();
94
95 int posX = blockIdx.x * blockDim.x + threadIdx.x + sOffsetX;
96 int posY = blockIdx.y * blockDim.y + threadIdx.y + sOffsetY;
97 const unsigned int offsetX = posX % blockEdgeSize;
98 const unsigned int offsetY = posY % blockEdgeSize;
99
100 const unsigned int offset = offsetY * blockEdgeSize + offsetX;
101
102 grid_key_dx<SparseGridType::d, int> blockCoord({posX / blockEdgeSize, posY / blockEdgeSize});
103 auto encap = sparseGrid.insertBlock(sparseGrid.getBlockLinId(blockCoord));
104
105 encap.template get<p>()[offset] = posX*posX * posY*posY;
106 BlockMapGpu_ker<>::setExist(encap.template get<pMask>()[offset]);
107
108 __syncthreads();
109
110 sparseGrid.flush_block_insert();
111}
112
113
114
115template<unsigned int p, unsigned int chunksPerBlock=1, typename SparseGridType, typename ScalarT>
116__global__ void insertConstantValue(SparseGridType sparseGrid, ScalarT value)
117{
118 constexpr unsigned int pMask = SparseGridType::pMask;
119 typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
120 typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
121
122 sparseGrid.init();
123
124 int x = blockIdx.x * blockDim.x + threadIdx.x;
125 int y = blockIdx.y * blockDim.y + threadIdx.y;
126 int z = blockIdx.z * blockDim.z + threadIdx.z;
128
129 auto pos = sparseGrid.getLinId(coord);
130 unsigned int dataBlockId = pos / BlockT::size;
131 unsigned int offset = pos % BlockT::size;
132
133 auto encap = sparseGrid.template insertBlock<chunksPerBlock>(dataBlockId,BlockT::size);
134
135 encap.template get<p>()[offset] = value;
136 BlockMapGpu_ker<>::setExist(encap.template get<pMask>()[offset]);
137
138 __syncthreads();
139
140 sparseGrid.flush_block_insert();
141
142 // Compiler avoid warning
143 x++;
144 y++;
145 z++;
146}
147
148
149template<unsigned int p, typename SparseGridType, typename ValueT>
150__global__ void insertOneValue(SparseGridType sparseGrid, dim3 pt, ValueT value)
151{
152 sparseGrid.init();
153
154 int x = blockIdx.x * blockDim.x + threadIdx.x;
155 int y = blockIdx.y * blockDim.y + threadIdx.y;
156 int z = blockIdx.z * blockDim.z + threadIdx.z;
157 dim3 thCoord(x, y, z);
158 if (thCoord.x == pt.x && thCoord.y == pt.y && thCoord.z == pt.z)
159 {
161 sparseGrid.template insert<p>(coord) = value;
162 }
163 __syncthreads();
164
165 sparseGrid.flush_block_insert();
166
167 // Compiler avoid warning
168 y++;
169 z++;
170}
171
172template<unsigned int p, typename SparseGridType, typename VectorOutType>
173__global__ void copyBlocksToOutput(SparseGridType sparseGrid, VectorOutType output)
174{
175 const auto bDimX = blockDim.x;
176 const auto bDimY = blockDim.y;
177 const auto bDimZ = blockDim.z;
178 const auto bIdX = blockIdx.x;
179 const auto bIdY = blockIdx.y;
180 const auto bIdZ = blockIdx.z;
181 const auto tIdX = threadIdx.x;
182 const auto tIdY = threadIdx.y;
183 const auto tIdZ = threadIdx.z;
184 int x = bIdX * bDimX + tIdX;
185 int y = bIdY * bDimY + tIdY;
186 int z = bIdZ * bDimZ + tIdZ;
188
189 size_t pos = sparseGrid.getLinId(coord);
190
191 auto value = sparseGrid.template get<p>(coord);
192
193 output.template get<p>(pos) = value;
194
195 // Compiler avoid warning
196 x++;
197 y++;
198 z++;
199}
200
201template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
203{
205
206 // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
207
208 static constexpr unsigned int flops = 3 + 2*dim;
209
210 static constexpr unsigned int supportRadius = 1;
211
224 template<typename SparseGridT, typename DataBlockWrapperT>
225 static inline __device__ void stencil(
226 SparseGridT & sparseGrid,
227 const unsigned int dataBlockId,
228 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
229 const unsigned int offset,
230 const grid_key_dx<dim, int> & pointCoord,
231 const DataBlockWrapperT & dataBlockLoad,
232 DataBlockWrapperT & dataBlockStore,
233 unsigned char curMask,
234 float dt)
235 {
236 typedef typename SparseGridT::AggregateBlockType AggregateT;
237 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
238
239 constexpr unsigned int enlargedBlockSize = IntPow<
240 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
241
242 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
243
244 sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
245
246 __syncthreads();
247
248 decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
249 ScalarT res = 0;
250
251 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
252 {
253 const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
254 // const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
255 linId = sparseGrid.getLinIdInEnlargedBlock(offset);
256 ScalarT cur = enlargedBlock[linId];
257 ScalarT laplacian = -2.0 * dim * cur; // The central part of the stencil
258
259 for (int d = 0; d < dim; ++d)
260 {
261 auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
262 auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
263 ScalarT neighbourPlus = enlargedBlock[nPlusId];
264 ScalarT neighbourMinus = enlargedBlock[nMinusId];
265 laplacian += neighbourMinus + neighbourPlus;
266 }
267 // enlargedBlock[linId] = cur + dt * laplacian;
268 res = cur + dt * laplacian;
269 }
270
271 __syncthreads();
272 if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
273 {
274 enlargedBlock[linId] = res;
275 }
276 __syncthreads();
277 sparseGrid.template storeBlock<p_dst>(dataBlockStore, enlargedBlock);
278 }
279
292 template<typename SparseGridT, typename DataBlockWrapperT>
293 static inline __host__ void stencilHost(
294 SparseGridT & sparseGrid,
295 const unsigned int dataBlockId,
296 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
297 const unsigned int offset,
298 const grid_key_dx<dim, int> & pointCoord,
299 const DataBlockWrapperT & dataBlockLoad,
300 DataBlockWrapperT & dataBlockStore,
301 bool isActive,
302 float dt)
303 {
304 constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
305
306 if (isActive)
307 {
308 auto cur = dataBlockLoad.template get<p_src>()[offset];
309 auto laplacian = -2.0 * dim * cur; // The central part of the stencil
310
311 auto neighbourCoord = pointCoord;
312 auto counter = offset;
313 unsigned int dimStride = 1;
314 for (int d = 0; d < dim; ++d)
315 {
316 const auto localOffset = counter % blockEdgeSize;
317
318 if (localOffset == 0) // This means we are at the lower boundary for this dimension
319 {
320 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
321 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
322 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
323 }
324 else
325 {
326 laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
327 }
328 if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
329 {
330 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
331 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
332 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
333 }
334 else
335 {
336 laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
337 }
338 //
339 counter /= blockEdgeSize;
340 dimStride *= blockEdgeSize;
341 }
342 dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
343 }
344 }
345
346 template <typename SparseGridT, typename CtxT>
347 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
348 {
349 sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
350 }
351};
352
353struct conv_coeff
354{
355 float coeff[3][3][3];
356};
357
358
359
360template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
361struct Conv3x3x3
362{
363 // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
364
365 typedef NNFull<dim> stencil_type;
366
367 static constexpr unsigned int supportRadius = 1;
368
369 static constexpr unsigned int flops = 2*27;
370
371 template<typename SparseGridT, typename DataBlockWrapperT>
372 static inline __device__ void stencil(
373 SparseGridT & sparseGrid,
374 const unsigned int dataBlockId,
376 unsigned int offset,
377 grid_key_dx<dim, int> & pointCoord,
378 DataBlockWrapperT & dataBlockLoad,
379 DataBlockWrapperT & dataBlockStore,
380 bool applyStencilHere,
381 conv_coeff & cc)
382 {
383 typedef typename SparseGridT::AggregateBlockType AggregateT;
384 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
385
386 constexpr unsigned int enlargedBlockSize = IntPow<
387 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
388
389 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
390
391 sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos,enlargedBlock);
392
393 __syncthreads();
394
395 if (applyStencilHere)
396 {
397 const auto coord = sparseGrid.getCoordInEnlargedBlock(offset);
398 const auto linId = sparseGrid.getLinIdInEnlargedBlock(offset);
399 ScalarT tot = 0.0;
400 for (int i = 0; i < dim; ++i)
401 {
402 for (int j = 0; j < dim; ++j)
403 {
404 for (int k = 0; k < dim; ++k)
405 {
407
408 key.set_d(0,i-1);
409 key.set_d(1,j-1);
410 key.set_d(2,k-1);
411
412 auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
413 tot += enlargedBlock[nPlusId] * cc.coeff[i][j][k];
414 }
415 }
416 }
417
418 dataBlockStore.template get<p_dst>()[offset] = tot;
419 }
420 }
421
422 template <typename SparseGridT, typename CtxT>
423 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
424 {
425 sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
426 }
427};
428
429template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
431{
432 // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
433
434 typedef NNFull<dim> stencil_type;
435
436 static constexpr unsigned int supportRadius = 1;
437
438 static constexpr unsigned int flops = 2*27;
439
440 template<typename SparseGridT, typename DataBlockWrapperT>
441 static inline __device__ void stencil(
442 SparseGridT & sparseGrid,
443 const unsigned int dataBlockId,
445 unsigned int offset,
446 grid_key_dx<dim, int> & pointCoord,
447 DataBlockWrapperT & dataBlockLoad,
448 DataBlockWrapperT & dataBlockStore,
449 bool applyStencilHere,
450 conv_coeff & cc)
451 {
452 typedef typename SparseGridT::AggregateBlockType AggregateT;
453 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
454
455 if (applyStencilHere)
456 {
457 ScalarT tot = 0.0;
458 for (int i = 0; i < dim; ++i)
459 {
460 for (int j = 0; j < dim; ++j)
461 {
462 for (int k = 0; k < dim; ++k)
463 {
464
466
467 key.set_d(0,k-1);
468 key.set_d(1,j-1);
469 key.set_d(2,i-1);
470
471 auto pos = sparseGrid.template getNNPoint<stencil_type>(dataBlockIdPos, offset, key);
472
473 tot += sparseGrid.template get<p_src>(pos) * cc.coeff[i][j][k];
474 }
475 }
476 }
477
478 dataBlockStore.template get<p_dst>()[offset] = tot;
479 }
480 }
481
482 template <typename SparseGridT, typename CtxT>
483 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
484 {
485 sparseGrid.template flush <smax_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
486 }
487};
488
489template<typename SparseGridZ>
490void testConv3x3x3_perf(std::string testName)
491{
492 constexpr unsigned int dim = 3;
493 constexpr unsigned int blockEdgeSize = 8;
494 typedef aggregate<float,float> AggregateT;
495
496 unsigned int iterations = 100;
497
498 size_t sz[] = {1000,1000,1000};
499
500 SparseGridZ sparseGrid(sz);
502 sparseGrid.template setBackgroundValue<0>(0);
503
504 // now create 3 3D sphere
505
506 grid_key_dx<3,int> start({256,256,256});
507
508 dim3 gridSize(32,32,32);
509
510 // Insert values on the grid
511 sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
512 CUDA_LAUNCH_DIM3((insertSphere3D_radius<0>),
513 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
514 sparseGrid.toKernel(), start,128, 56, 1);
515
516 sparseGrid.template flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
517
518 sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
519 auto existingElements = sparseGrid.countExistingElements();
520 auto boundaryElements = sparseGrid.countBoundaryElements();
521 unsigned long long numElements = existingElements - boundaryElements;
522
523 sparseGrid.template findNeighbours<NNFull<3>>(); // Pre-compute the neighbours pos for each block!
524
525 sparseGrid.template setNNType<NNFull<dim>>();
526 sparseGrid.template tagBoundaries<NNFull<3>>(ctx);
527
528 openfpm::vector<double> measures_gf;
529 openfpm::vector<double> measures_tm;
530
531 for (unsigned int iter=0; iter<iterations; ++iter)
532 {
533 cudaDeviceSynchronize();
534 conv_coeff cc;
535
536 for (int i = 0 ; i < 3 ; i++)
537 {
538 for (int j = 0 ; j < 3 ; j++)
539 {
540 for (int k = 0 ; k < 3 ; k++)
541 {
542 cc.coeff[k][j][i] = 1.0;
543 }
544 }
545 }
546
547 timer ts;
548 ts.start();
549
550 sparseGrid.template applyStencils<Conv3x3x3<dim,0,1>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,cc);
551
552 cudaDeviceSynchronize();
553 ts.stop();
554
555 measures_tm.add(ts.getwct());
556
557 float gElemS = numElements / (1e9 * ts.getwct());
558 float gFlopsS = gElemS * Conv3x3x3<dim,0,1>::flops;
559
560 measures_gf.add(gFlopsS);
561 }
562
563 double mean_tm = 0;
564 double deviation_tm = 0;
565 standard_deviation(measures_tm,mean_tm,deviation_tm);
566
567 double mean_gf = 0;
568 double deviation_gf = 0;
569 standard_deviation(measures_gf,mean_gf,deviation_gf);
570
571 std::cout << "Test: " << testName << std::endl;
572 std::cout << "Block: " << SparseGridZ::blockEdgeSize_
573 << "x" << SparseGridZ::blockEdgeSize_
574 << "x" << SparseGridZ::blockEdgeSize_
575 << std::endl;
576 std::cout << "Grid: " << sz[0] << "x" << sz[1] << "x" << sz[2] << std::endl;
577
578 double dataOccupancyMean, dataOccupancyDev;
579 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
580 std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
581
582 std::cout << "Iterations: " << iterations << std::endl;
583 std::cout << "\tConvolution3x3x3: " << mean_tm << " dev:" << deviation_tm << " s" << std::endl;
584 std::cout << "Throughput: " << std::endl << "\t " << mean_gf << " GFlops/s dev: " << deviation_gf << " GFlops/s " << std::endl;
585}
586
587template<typename SparseGridZ>
588static void testConv3x3x3_no_shared_perf(std::string testName)
589{
590 unsigned int iterations = 100;
591
592 size_t sz[] = {1000,1000,1000};
593
594 SparseGridZ sparseGrid(sz);
596 sparseGrid.template setBackgroundValue<0>(0);
597
598 // now create 3 3D sphere
599
600 grid_key_dx<3,int> start({256,256,256});
601
602 dim3 gridSize(32,32,32);
603
604 // Insert values on the grid
605 sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
606 CUDA_LAUNCH_DIM3((insertSphere3D_radius<0>),
607 gridSize, dim3(SparseGridZ::blockEdgeSize_*SparseGridZ::blockEdgeSize_*SparseGridZ::blockEdgeSize_,1,1),
608 sparseGrid.toKernel(), start,128, 56, 1);
609
610 sparseGrid.template flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
611
612 sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
613 auto existingElements = sparseGrid.countExistingElements();
614 auto boundaryElements = sparseGrid.countBoundaryElements();
615 unsigned long long numElements = existingElements - boundaryElements;
616
617 sparseGrid.template findNeighbours<NNFull<3>>(); // Pre-compute the neighbours pos for each block!
618
619 sparseGrid.template setNNType<NNFull<SparseGridZ::dims>>();
620 sparseGrid.template tagBoundaries<NNFull<3>>(ctx,No_check(),tag_boundaries::CALCULATE_EXISTING_POINTS);
621
622
623 openfpm::vector<double> measures_gf;
624 openfpm::vector<double> measures_tm;
625
626 for (unsigned int iter=0; iter<iterations; ++iter)
627 {
628 cudaDeviceSynchronize();
629 conv_coeff cc;
630
631 for (int i = 0 ; i < 3 ; i++)
632 {
633 for (int j = 0 ; j < 3 ; j++)
634 {
635 for (int k = 0 ; k < 3 ; k++)
636 {
637 cc.coeff[k][j][i] = 1.0;
638 }
639 }
640 }
641
642 timer ts;
643 ts.start();
644
645 sparseGrid.template applyStencils<Conv3x3x3_noshared<SparseGridZ::dims,0,1>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE_NO_SHARED,cc);
646
647 cudaDeviceSynchronize();
648 ts.stop();
649
650 measures_tm.add(ts.getwct());
651
652 float gElemS = numElements / (1e9 * ts.getwct());
654
655 measures_gf.add(gFlopsS);
656 }
657
658 double mean_tm = 0;
659 double deviation_tm = 0;
660 standard_deviation(measures_tm,mean_tm,deviation_tm);
661
662 double mean_gf = 0;
663 double deviation_gf = 0;
664 standard_deviation(measures_gf,mean_gf,deviation_gf);
665
666 std::cout << "Test: " << testName << std::endl;
667 std::cout << "Block: " << SparseGridZ::blockEdgeSize_
668 << "x" << SparseGridZ::blockEdgeSize_
669 << "x" << SparseGridZ::blockEdgeSize_
670 << std::endl;
671 std::cout << "Grid: " << sz[0] << "x" << sz[1] << "x" << sz[2] << std::endl;
672
673 double dataOccupancyMean, dataOccupancyDev;
674 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
675 std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
676
677 std::cout << "Iterations: " << iterations << std::endl;
678 std::cout << "\tConvolution3x3x3: " << mean_tm << " dev:" << deviation_tm << " s" << std::endl;
679 std::cout << "Throughput: " << std::endl << "\t " << mean_gf << " GFlops/s dev: " << deviation_gf << " GFlops/s " << std::endl;
680}
681
682template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
684{
686
687 // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
688
689 static constexpr unsigned int flops = 3 + 2*dim;
690
691 static constexpr unsigned int supportRadius = 1;
692
705 template<typename SparseGridT, typename DataBlockWrapperT>
706 static inline __device__ void stencil(
707 SparseGridT & sparseGrid,
708 const unsigned int dataBlockId,
709 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
710 const unsigned int offset,
711 const grid_key_dx<dim, int> & pointCoord,
712 const DataBlockWrapperT & dataBlockLoad,
713 DataBlockWrapperT & dataBlockStore,
714 bool isActive,
715 float dt)
716 {
717 typedef typename SparseGridT::AggregateBlockType AggregateT;
718 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
719
720 ScalarT res = 0;
721 auto coord = pointCoord;
722
723 if (isActive)
724 {
725 // ScalarT cur = sparseGrid.template get<p_src>(pointCoord);
726 ScalarT cur = dataBlockLoad.template get<p_dst>()[offset];
727 ScalarT laplacian = -2.0 * dim * cur; // The central part of the stencil
728
729 for (int d = 0; d < dim; ++d)
730 {
731 auto locC = coord.get(d);
732 coord.set_d(d, locC+1);
733 auto nPlus = sparseGrid.template get<p_src>(coord);
734 coord.set_d(d, locC-1);
735 auto nMinus = sparseGrid.template get<p_src>(coord);
736 laplacian += nMinus + nPlus;
737 coord.set_d(d, locC);
738 }
739 res = cur + dt * laplacian;
740 }
741 __syncthreads();
742 if (isActive)
743 {
744 dataBlockStore.template get<p_dst>()[offset] = res;
745 }
746 }
747
760 template<typename SparseGridT, typename DataBlockWrapperT>
761 static inline __host__ void stencilHost(
762 SparseGridT & sparseGrid,
763 const unsigned int dataBlockId,
764 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
765 const unsigned int offset,
766 const grid_key_dx<dim, int> & pointCoord,
767 const DataBlockWrapperT & dataBlockLoad,
768 DataBlockWrapperT & dataBlockStore,
769 bool isActive,
770 float dt)
771 {
772 constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
773
774 if (isActive)
775 {
776 auto cur = dataBlockLoad.template get<p_src>()[offset];
777 auto laplacian = -2.0 * dim * cur; // The central part of the stencil
778
779 auto neighbourCoord = pointCoord;
780 auto counter = offset;
781 unsigned int dimStride = 1;
782 for (int d = 0; d < dim; ++d)
783 {
784 const auto localOffset = counter % blockEdgeSize;
785
786 if (localOffset == 0) // This means we are at the lower boundary for this dimension
787 {
788 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
789 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
790 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
791 }
792 else
793 {
794 laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
795 }
796 if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
797 {
798 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
799 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
800 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
801 }
802 else
803 {
804 laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
805 }
806 //
807 counter /= blockEdgeSize;
808 dimStride *= blockEdgeSize;
809 }
810 dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
811 }
812 }
813
814 template <typename SparseGridT, typename CtxT>
815 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
816 {
817 sparseGrid.template flush <sRight_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
818 }
819};
820
821template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
823{
825
826 // This is an example of a laplacian smoothing stencil to apply using the apply stencil facility of SparseGridGpu
827
828 static constexpr unsigned int flops = 1;
829
830 static constexpr unsigned int supportRadius = 1;
831
844 template<typename SparseGridT, typename DataBlockWrapperT>
845 static inline __device__ void stencil(
846 SparseGridT & sparseGrid,
847 const unsigned int dataBlockId,
848 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
849 const unsigned int offset,
850 const grid_key_dx<dim, int> & pointCoord,
851 const DataBlockWrapperT & dataBlockLoad,
852 DataBlockWrapperT & dataBlockStore,
853 bool isActive,
854 float dt)
855 {
856 typedef typename SparseGridT::AggregateBlockType AggregateT;
857 typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
858
859 constexpr unsigned int enlargedBlockSize = IntPow<
860 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
861
862 __shared__ ScalarT enlargedBlock[enlargedBlockSize];
863 sparseGrid.loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
864
865 __syncthreads();
866
867 decltype(sparseGrid.getLinIdInEnlargedBlock(0)) linId = 0;
868 ScalarT res = 0;
869
870 if (isActive)
871 {
872 linId = sparseGrid.getLinIdInEnlargedBlock(offset);
873 ScalarT cur = enlargedBlock[linId];
874
875 res = cur + cur;
876 }
877 __syncthreads();
878 if (isActive)
879 {
880 enlargedBlock[linId] = res;
881 }
882 __syncthreads();
883 sparseGrid.storeBlock<p_dst>(dataBlockStore, enlargedBlock);
884 }
885
898 template<typename SparseGridT, typename DataBlockWrapperT>
899 static inline __host__ void stencilHost(
900 SparseGridT & sparseGrid,
901 const unsigned int dataBlockId,
902 const openfpm::sparse_index<unsigned int> dataBlockIdPos,
903 const unsigned int offset,
904 const grid_key_dx<dim, int> & pointCoord,
905 const DataBlockWrapperT & dataBlockLoad,
906 DataBlockWrapperT & dataBlockStore,
907 bool isActive,
908 float dt)
909 {
910 constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
911
912 if (isActive)
913 {
914 auto cur = dataBlockLoad.template get<p_src>()[offset];
915 auto laplacian = -2.0 * dim * cur; // The central part of the stencil
916
917 auto neighbourCoord = pointCoord;
918 auto counter = offset;
919 unsigned int dimStride = 1;
920 for (int d = 0; d < dim; ++d)
921 {
922 const auto localOffset = counter % blockEdgeSize;
923
924 if (localOffset == 0) // This means we are at the lower boundary for this dimension
925 {
926 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
927 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
928 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
929 }
930 else
931 {
932 laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
933 }
934 if (localOffset == blockEdgeSize - 1) // This means we are at the lower boundary for this dimension
935 {
936 neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
937 laplacian += sparseGrid.template get<p_src>(neighbourCoord);
938 neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
939 }
940 else
941 {
942 laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
943 }
944 //
945 counter /= blockEdgeSize;
946 dimStride *= blockEdgeSize;
947 }
948 dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
949 }
950 }
951
952 template <typename SparseGridT, typename CtxT>
953 static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
954 {
955 sparseGrid.template flush <sRight_<0>> (ctx, flush_type::FLUSH_ON_DEVICE);
956 }
957};
958
959
960
961
962#endif /* SPARSEGRIDGPU_UTIL_TEST_CUH_ */
grid_key_dx is the key to access any element in the grid
Definition grid_key.hpp:19
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
Definition grid_key.hpp:516
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
Definition timer.hpp:28
void stop()
Stop the timer.
Definition timer.hpp:119
void start()
Start the timer.
Definition timer.hpp:90
double getwct()
Return the elapsed real time.
Definition timer.hpp:130
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil function.
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, unsigned char curMask, float dt)
Stencil function.
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
static __host__ void stencilHost(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil Host function.
static __device__ void stencil(SparseGridT &sparseGrid, const unsigned int dataBlockId, const openfpm::sparse_index< unsigned int > dataBlockIdPos, const unsigned int offset, const grid_key_dx< dim, int > &pointCoord, const DataBlockWrapperT &dataBlockLoad, DataBlockWrapperT &dataBlockStore, bool isActive, float dt)
Stencil function.
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...