OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
SparseGridGpu_performance_heat_stencil_3d.cu
1 /*
2  * SparseGridGpu_performance_heat_stencil_3d.cu
3  *
4  * Created on: Sep 10, 2019
5  * Author: i-bird
6  */
7 #define BOOST_TEST_DYN_LINK
8 #define DISABLE_MPI_WRITTERS
9 
10 #include <boost/test/unit_test.hpp>
11 #include "performancePlots.hpp"
12 #include <iostream>
13 #include "SparseGridGpu/SparseGridGpu.hpp"
14 #include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15 
16 extern std::string suiteURI;
17 extern report_sparse_grid_tests report_sparsegrid_funcs;
18 extern std::set<std::string> testSet;
19 
20 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
21 void testStencilHeat3D_perf(unsigned int i, std::string base)
22 {
23  auto testName = "In-place 3D stencil";
24 // unsigned int gridEdgeSize = 128;
25 // unsigned int gridEdgeSize = 64;
26  typedef HeatStencil<SparseGridZ::dims,0,1> Stencil01T;
27  typedef HeatStencil<SparseGridZ::dims,1,0> Stencil10T;
28 
29  report_sparsegrid_funcs.graphs.put(base + ".dim",3);
30  report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
31  report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
32  report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
33  report_sparsegrid_funcs.graphs.put(base + ".gridSize.z",gridEdgeSize*SparseGridZ::blockEdgeSize_);
34 
35  unsigned int iterations = 100;
36 
37  openfpm::vector<double> measures_gf;
38  openfpm::vector<double> measures_tm;
39 
40  dim3 gridSize(gridEdgeSize, gridEdgeSize, gridEdgeSize);
41  dim3 blockSize(SparseGridZ::blockEdgeSize_, SparseGridZ::blockEdgeSize_, SparseGridZ::blockEdgeSize_);
42 
43  typename SparseGridZ::grid_info blockGeometry(gridSize);
44  SparseGridZ sparseGrid(blockGeometry);
45  mgpu::ofp_context_t ctx;
46  sparseGrid.template setBackgroundValue<0>(0);
47 
48  unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_
49  *gridEdgeSize*SparseGridZ::blockEdgeSize_
50  *gridEdgeSize*SparseGridZ::blockEdgeSize_;
51 
52  // Initialize the grid
53  sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
54  CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
55  sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
56 
57  sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
58  dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2,
59  gridSize.y * SparseGridZ::blockEdgeSize_ / 2,
60  gridSize.z * SparseGridZ::blockEdgeSize_ / 2);
61  insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
62  sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
63 
64  sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
65 
66  iterations /= 2;
67  for (unsigned int iter=0; iter<iterations; ++iter)
68  {
69  cudaDeviceSynchronize();
70 
71  timer ts;
72  ts.start();
73 
74  sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
75  cudaDeviceSynchronize();
76  sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
77  cudaDeviceSynchronize();
78 
79  ts.stop();
80 
81  measures_tm.add(ts.getwct());
82 
83  float gElemS = 2 * numElements / (1e9 * ts.getwct());
84  float gFlopsS = gElemS * Stencil01T::flops;
85 
86  measures_gf.add(gFlopsS);
87  }
88 
89  double mean_tm = 0;
90  double deviation_tm = 0;
91  standard_deviation(measures_tm,mean_tm,deviation_tm);
92 
93  double mean_gf = 0;
94  double deviation_gf = 0;
95  standard_deviation(measures_gf,mean_gf,deviation_gf);
96 
97  // All times above are in ms
98 
99  float gElemS = 2 * numElements / (1e9 * mean_tm);
100  float gFlopsS = gElemS * Stencil01T::flops;
101  std::cout << "Test: " << testName << std::endl;
102  std::cout << "Block: " << SparseGridZ::blockEdgeSize_
103  << "x" << SparseGridZ::blockEdgeSize_
104  << "x" << SparseGridZ::blockEdgeSize_
105  << std::endl;
106  std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_
107  << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_
108  << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_
109  << std::endl;
110  double dataOccupancyMean, dataOccupancyDev;
111  sparseGrid.deviceToHost();
112  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
113  report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
114  report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
115  std::cout << "Iterations: " << iterations << std::endl;
116  std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
117  std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
118 
119  report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
120  report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
121  report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
122  report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
123 }
124 
125 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
126 void launch_testStencilHeat3D_perf(std::string testURI, unsigned int i)
127 {
128  constexpr unsigned int dim = 3;
129  typedef aggregate<float,float> AggregateT;
130  constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
131 
132  std::string base(testURI + "(" + std::to_string(i) + ")");
133  report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN3D");
134 
135  testStencilHeat3D_perf<blockEdgeSize, gridEdgeSize,
137  cudaDeviceSynchronize();
138 }
139 
140 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
141 void testStencilHeat3DSparse_perf(unsigned int i, std::string base, float fillMultiplier=1, float voidMultiplier=1)
142 {
143  auto testName = "In-place 3D sparse stencil";
144 // unsigned int gridEdgeSize = 32;
145  constexpr unsigned int dim = SparseGridZ::dims;
146 // const unsigned int blockEdgeSize = SparseGridZ::blockEdgeSize_;
147 
148  typedef HeatStencil<dim, 0, 1> Stencil01T;
149  typedef HeatStencil<dim, 1, 0> Stencil10T;
150 
151 // std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").stencil");
152 
153  report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
154  report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
155  report_sparsegrid_funcs.graphs.put(base + ".gridSize.x", gridEdgeSize * blockEdgeSize);
156  report_sparsegrid_funcs.graphs.put(base + ".gridSize.y", gridEdgeSize * blockEdgeSize);
157  report_sparsegrid_funcs.graphs.put(base + ".gridSize.z", gridEdgeSize * blockEdgeSize);
158 
159  unsigned int iterations = 100;
160 
161  openfpm::vector<double> measures_gf;
162  openfpm::vector<double> measures_tm;
163 
164  dim3 gridSize(gridEdgeSize, gridEdgeSize, gridEdgeSize);
165  unsigned int spatialEdgeSize = 10000;
166  size_t sz[3] = {spatialEdgeSize, spatialEdgeSize, spatialEdgeSize};
167  typename SparseGridZ::grid_info blockGeometry(sz);
168  SparseGridZ sparseGrid(blockGeometry);
169  mgpu::ofp_context_t ctx;
170  sparseGrid.template setBackgroundValue<0>(0);
171 
173  float allMultiplier = fillMultiplier + voidMultiplier;
174  const unsigned int numSpheres = gridEdgeSize / (2*allMultiplier);
175  unsigned int centerPoint = spatialEdgeSize / 2;
176 
177  for (int i = 1; i <= numSpheres; ++i)
178  {
179  unsigned int rBig = allMultiplier*i * blockEdgeSize;
180  unsigned int rSmall = (allMultiplier*i - fillMultiplier) * blockEdgeSize;
181  // Sphere i-th
182  grid_key_dx<dim, int> start1({centerPoint, centerPoint, centerPoint});
183  sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
184  CUDA_LAUNCH_DIM3((insertSphere3D<0>),
185  gridSize, dim3(blockEdgeSize * blockEdgeSize * blockEdgeSize, 1, 1),
186  sparseGrid.toKernel(), start1, rBig, rSmall, 1);
187  cudaDeviceSynchronize();
188  sparseGrid.template flush<smax_<0 >>(ctx, flush_type::FLUSH_ON_DEVICE);
189  cudaDeviceSynchronize();
190  }
192 
193  sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
194  sparseGrid.tagBoundaries(ctx);
195 
196  sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
197  auto existingElements = sparseGrid.countExistingElements();
198  auto boundaryElements = sparseGrid.countBoundaryElements();
199  unsigned long long numElements = existingElements - boundaryElements;
200 
201  // Now apply some boundary conditions
202  sparseGrid.template applyStencils<BoundaryStencilSetXRescaled<dim,0,0>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,
203  centerPoint, centerPoint + 2*blockEdgeSize*gridEdgeSize,
204  0.0, 10.0);
205 
206  iterations /= 2;
207  for (unsigned int iter=0; iter<iterations; ++iter)
208  {
209 
210  timer ts;
211  ts.start();
212 
213  sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
214  sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
215 
216  ts.stop();
217 
218  measures_tm.add(ts.getwct());
219 
220  float gElemS = 2 * numElements / (1e9 * ts.getwct());
221  float gFlopsS = gElemS * Stencil01T::flops;
222 
223  measures_gf.add(gFlopsS);
224  }
225 
226  double mean_tm = 0;
227  double deviation_tm = 0;
228  standard_deviation(measures_tm,mean_tm,deviation_tm);
229 
230  double mean_gf = 0;
231  double deviation_gf = 0;
232  standard_deviation(measures_gf,mean_gf,deviation_gf);
233 
234  // All times above are in ms
235 
236  float gElemS = 2 * numElements / (1e9 * mean_tm);
237  float gFlopsS = gElemS * Stencil01T::flops;
238  std::cout << "Test: " << testName << std::endl;
239  std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "x" << blockEdgeSize << std::endl;
240  std::cout << "Grid: " << gridEdgeSize * blockEdgeSize
241  << "x" << gridEdgeSize * blockEdgeSize
242  << "x" << gridEdgeSize * blockEdgeSize
243  << std::endl;
244  double dataOccupancyMean, dataOccupancyDev;
245  sparseGrid.deviceToHost();
246  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
247  report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
248  report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
249  std::cout << "Iterations: " << iterations << std::endl;
250  std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
251  std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
252 
253  report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
254  report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
255  report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
256  report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
257 
258 // // DEBUG
259 // sparseGrid.template deviceToHost<0,1>();
260 // sparseGrid.write("SparseGridGPU_testStencilHeat3DSparse_perf_DEBUG.vtk");
261 }
262 
263 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
264 void launch_testStencilHeat3DSparse_perf(std::string testURI, unsigned int i,
265  float fillMultiplier=1, float voidMultiplier=1)
266 {
267  constexpr unsigned int dim = 3;
268  typedef aggregate<float,float> AggregateT;
269  constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
270 
271  std::string base(testURI + "(" + std::to_string(i) + ")");
272  report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN3DSparse");
273 
274  testStencilHeat3DSparse_perf<blockEdgeSize, gridEdgeSize,
276  fillMultiplier, voidMultiplier);
277  cudaDeviceSynchronize();
278 }
279 
280 BOOST_AUTO_TEST_SUITE(performance)
281 
282 BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
283 
284 BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling)
285 {
286  std::string testURI = suiteURI + ".device.stencil.dense.N.3D.gridScaling";
287  unsigned int counter = 0;
288  constexpr unsigned int blockEdgeSize = 8;
289  launch_testStencilHeat3D_perf<blockEdgeSize, 8>(testURI, counter++);
290  launch_testStencilHeat3D_perf<blockEdgeSize, 16>(testURI, counter++);
291  launch_testStencilHeat3D_perf<blockEdgeSize, 32>(testURI, counter++);
292  launch_testStencilHeat3D_perf<blockEdgeSize, 64>(testURI, counter++);
293 // launch_testStencilHeat3D_perf<blockEdgeSize, 128>(testURI, counter++);
294 
295  testSet.insert(testURI);
296 }
297 
298 BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_2)
299 {
300  std::string testURI = suiteURI + ".device.stencil.dense.N.3D.2.gridScaling";
301  unsigned int counter = 0;
302  launch_testStencilHeat3D_perf<2, 32>(testURI, counter++);
303  launch_testStencilHeat3D_perf<2, 64>(testURI, counter++);
304  launch_testStencilHeat3D_perf<2, 128>(testURI, counter++);
305  // launch_testStencilHeat3D_perf<2, 256>(testURI, counter++);
306 
307  testSet.insert(testURI);
308 }
309 
310 BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_4)
311 {
312  std::string testURI = suiteURI + ".device.stencil.dense.N.3D.4.gridScaling";
313  unsigned int counter = 0;
314  launch_testStencilHeat3D_perf<4, 16>(testURI, counter++);
315  launch_testStencilHeat3D_perf<4, 32>(testURI, counter++);
316  launch_testStencilHeat3D_perf<4, 64>(testURI, counter++);
317 // launch_testStencilHeat3D_perf<4, 128>(testURI, counter++);
318 
319  testSet.insert(testURI);
320 }
321 
322 BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_8)
323 {
324  std::string testURI = suiteURI + ".device.stencil.dense.N.3D.8.gridScaling";
325  unsigned int counter = 0;
326  launch_testStencilHeat3D_perf<8, 8>(testURI, counter++);
327  launch_testStencilHeat3D_perf<8, 16>(testURI, counter++);
328  launch_testStencilHeat3D_perf<8, 32>(testURI, counter++);
329  launch_testStencilHeat3D_perf<8, 64>(testURI, counter++);
330 
331  testSet.insert(testURI);
332 }
333 
334 BOOST_AUTO_TEST_CASE(testStencilHeat3D_blockScaling)
335 {
336  std::string testURI = suiteURI + ".device.stencil.dense.N.3D.blockScaling";
337  unsigned int counter = 0;
338  launch_testStencilHeat3D_perf<2, 128>(testURI, counter++);
339  launch_testStencilHeat3D_perf<4, 64>(testURI, counter++);
340  launch_testStencilHeat3D_perf<8, 32>(testURI, counter++);
341 // launch_testStencilHeat3D_perf<16, 16>(testURI, counter++); // Too big, it doesn't work
342 
343  testSet.insert(testURI);
344 }
345 
346 BOOST_AUTO_TEST_CASE(testStencilHeat3DSparse_gridScaling)
347 {
348  std::string testURI = suiteURI + ".device.stencil.sparse.N.3D.gridScaling";
349  unsigned int counter = 0;
350  constexpr unsigned int blockEdgeSize = 8;
351  launch_testStencilHeat3DSparse_perf<blockEdgeSize, 8>(testURI, counter++, 1, 1);
352  launch_testStencilHeat3DSparse_perf<blockEdgeSize, 16>(testURI, counter++, 1, 1);
353  launch_testStencilHeat3DSparse_perf<blockEdgeSize, 32>(testURI, counter++, 1, 1);
354  launch_testStencilHeat3DSparse_perf<blockEdgeSize, 64>(testURI, counter++, 1, 1);
355 
356  testSet.insert(testURI);
357 }
358 
359 BOOST_AUTO_TEST_CASE(testStencilHeat3DSparse_blockScaling)
360 {
361  std::string testURI = suiteURI + ".device.stencil.sparse.N.3D.blockScaling";
362  unsigned int counter = 0;
363  launch_testStencilHeat3DSparse_perf<2, 128>(testURI, counter++, 1, 1);
364  launch_testStencilHeat3DSparse_perf<4, 64>(testURI, counter++, 1, 1);
365  launch_testStencilHeat3DSparse_perf<8, 32>(testURI, counter++, 1, 1);
366 // launch_testStencilHeat3DSparse_perf<16, 16>(testURI, counter++, 1, 1); // Too big, it doesn't work
367 
368  testSet.insert(testURI);
369 }
370 
371 
372 
373 BOOST_AUTO_TEST_SUITE_END()
374 
375 BOOST_AUTO_TEST_SUITE_END()
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119