OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
SparseGridGpu_performance_heat_stencil_sparse.cu
1 /*
2  * SparseGridGpu_performance_heat_stencil_sparse.cu
3  *
4  * Created on: Sep 10, 2019
5  * Author: i-bird
6  */
7 #define BOOST_TEST_DYN_LINK
8 #define DISABLE_MPI_WRITTERS
9 
10 #include <boost/test/unit_test.hpp>
11 #include "performancePlots.hpp"
12 #include <iostream>
13 #include "SparseGridGpu/SparseGridGpu.hpp"
14 #include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15 
16 extern std::string suiteURI;
17 extern report_sparse_grid_tests report_sparsegrid_funcs;
18 extern std::set<std::string> testSet;
19 
/*! \brief Benchmark the in-place heat stencil on a 2D sparse grid and record the results.
 *
 * The grid is populated by repeatedly launching the insertSphere<0> kernel around the
 * centre of the domain, then the two HeatStencil variants (property 0 -> 1 and 1 -> 0)
 * are applied alternately in STENCIL_MODE_INPLACE. Wall-clock time and derived GFlops
 * of every iteration are collected, printed to std::cout and stored in
 * report_sparsegrid_funcs.graphs under the key prefix \p base.
 *
 * \tparam blockEdgeSize edge length of one block (also used to size the CUDA thread block)
 * \tparam gridEdgeSize  number of blocks per edge used for the insertion launch grid
 * \tparam SparseGridZ   sparse grid type under test; must expose `dims` and `grid_info`
 *
 * \param i              index of the run; not used in this function (the caller already
 *                       encodes it in \p base)
 * \param base           key prefix under which the measurements are stored
 * \param fillMultiplier thickness of each filled shell, in units of blockEdgeSize
 * \param voidMultiplier additional radius step between consecutive shells, in units of
 *                       blockEdgeSize (presumably the empty gap - confirm against insertSphere)
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
void testStencilHeatSparse_perf(unsigned int i, std::string base, float fillMultiplier=1, float voidMultiplier=1)
{
	auto testName = "In-place sparse stencil";
	constexpr unsigned int dim = SparseGridZ::dims;

	typedef HeatStencil<dim, 0, 1> Stencil01T;
	typedef HeatStencil<dim, 1, 0> Stencil10T;

	// The setup below (sz[2], 2D launch grid) is two-dimensional, hence the literal 2.
	report_sparsegrid_funcs.graphs.put(base + ".dim",2);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	unsigned int iterations = 100;

	openfpm::vector<double> measures_gf;
	openfpm::vector<double> measures_tm;

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	unsigned int spatialEdgeSize = 1000000;
	size_t sz[2] = {spatialEdgeSize, spatialEdgeSize};
	typename SparseGridZ::grid_info blockGeometry(sz);
	SparseGridZ sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	// Each shell advances the outer radius by (fill + void) blocks.
	float allMultiplier = fillMultiplier + voidMultiplier;
	const unsigned int numSpheres = gridEdgeSize / (2*allMultiplier);
	unsigned int centerPoint = spatialEdgeSize / 2;

	// The loop index is unsigned and has its own name: the original `int i` shadowed
	// the function parameter `i` and was compared against the unsigned numSpheres.
	for (unsigned int sphere = 1; sphere <= numSpheres; ++sphere)
	{
		unsigned int rBig = allMultiplier*sphere * blockEdgeSize;
		unsigned int rSmall = (allMultiplier*sphere - fillMultiplier) * blockEdgeSize;
		// Sphere number `sphere`, centred in the middle of the domain
		grid_key_dx<dim, int> start1({centerPoint, centerPoint});
		sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
		CUDA_LAUNCH_DIM3((insertSphere<0>),
				gridSize, dim3(blockEdgeSize * blockEdgeSize, 1, 1),
				sparseGrid.toKernel(), start1, rBig, rSmall, 5);
		cudaDeviceSynchronize();
		sparseGrid.template flush<smax_<0 >>(ctx, flush_type::FLUSH_ON_DEVICE);
		cudaDeviceSynchronize();
	}

	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
	sparseGrid.tagBoundaries(ctx);

	sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
	auto existingElements = sparseGrid.countExistingElements();
	auto boundaryElements = sparseGrid.countBoundaryElements();
	// Only the non-boundary elements are counted in the throughput figures.
	unsigned long long numElements = existingElements - boundaryElements;

	// Now apply some boundary conditions
	sparseGrid.template applyStencils<BoundaryStencilSetXRescaled<dim,0,0>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,
			centerPoint, centerPoint + 2*blockEdgeSize*gridEdgeSize,
			0.0, 10.0);

	// Every timed iteration applies two stencils, so halve the iteration count.
	iterations /= 2;
	for (unsigned int iter=0; iter<iterations; ++iter)
	{
		// Make sure no earlier GPU work leaks into the timed region.
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
		cudaDeviceSynchronize();
		sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
		cudaDeviceSynchronize();

		ts.stop();

		measures_tm.add(ts.getwct());

		// Factor 2: both stencil applications touch every counted element once.
		float gElemS = 2 * numElements / (1e9 * ts.getwct());
		float gFlopsS = gElemS * Stencil01T::flops;

		measures_gf.add(gFlopsS);
	}

	double mean_tm = 0;
	double deviation_tm = 0;
	standard_deviation(measures_tm,mean_tm,deviation_tm);

	double mean_gf = 0;
	double deviation_gf = 0;
	standard_deviation(measures_gf,mean_gf,deviation_gf);

	// The throughput formula below treats the measured times as seconds
	// (elements / (1e9 * t) is reported as GElem/s).
	// NOTE(review): the original comment said "ms" - confirm the unit of timer::getwct().

	float gElemS = 2 * numElements / (1e9 * mean_tm);
	float gFlopsS = gElemS * Stencil01T::flops;
	std::cout << "Test: " << testName << std::endl;
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << std::endl;
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << std::endl;
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
	std::cout << "Iterations: " << iterations << std::endl;
	// NOTE(review): the values printed on the next line are GFlops although the trailing unit reads " s".
	std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
	std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;

	report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
	report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
	report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
	report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
}
139 
/*! \brief Run the sparse heat-stencil benchmark on a 2D float/float sparse grid.
 *
 * Builds the report key `testURI(i)`, stores the test name
 * ("StencilNSparse" + occupancyStr) under it and forwards to
 * testStencilHeatSparse_perf.
 *
 * \tparam blockEdgeSize edge length of one block
 * \tparam gridEdgeSize  number of blocks per edge
 *
 * \param testURI        report path of the test case
 * \param i              index of this run inside the test case
 * \param fillMultiplier forwarded to testStencilHeatSparse_perf
 * \param voidMultiplier forwarded to testStencilHeatSparse_perf
 * \param occupancyStr   label appended to the stored test name
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void launch_testStencilHeatSparse_perf(std::string testURI, unsigned int i,
		float fillMultiplier=1, float voidMultiplier=1, std::string occupancyStr="05")
{
	constexpr unsigned int dim = 2;
	typedef aggregate<float,float> AggregateT;
	// One chunk holds blockEdgeSize^dim elements.
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;

	std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilNSparse"+occupancyStr);

	// NOTE(review): the line carrying the grid type and the first two call arguments
	// was missing from this listing; it is reconstructed from the locals above
	// (dim, AggregateT, chunkSize) - confirm against the upstream file.
	testStencilHeatSparse_perf<blockEdgeSize, gridEdgeSize,
			SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize, long int>>(i, base,
			fillMultiplier, voidMultiplier);
	cudaDeviceSynchronize();
}
156 
/*! \brief Run the sparse heat-stencil benchmark on the "Z" variant of the 2D sparse grid.
 *
 * Builds the report key `testURI(i)`, stores the test name
 * ("StencilNSparse" + occupancyStr) under it and forwards to
 * testStencilHeatSparse_perf.
 *
 * \tparam blockEdgeSize edge length of one block
 * \tparam gridEdgeSize  number of blocks per edge
 *
 * \param testURI        report path of the test case
 * \param i              index of this run inside the test case
 * \param fillMultiplier forwarded to testStencilHeatSparse_perf
 * \param voidMultiplier forwarded to testStencilHeatSparse_perf
 * \param occupancyStr   label appended to the stored test name
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void launch_testStencilHeatSparseZ_perf(std::string testURI, unsigned int i,
		float fillMultiplier=1, float voidMultiplier=1, std::string occupancyStr="05")
{
	constexpr unsigned int dim = 2;
	typedef aggregate<float,float> AggregateT;
	// One chunk holds blockEdgeSize^dim elements.
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;

	std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilNSparse"+occupancyStr);

	// NOTE(review): the line carrying the grid type and the first two call arguments
	// was missing from this listing; SparseGridGpu_z (presumably the Z-order
	// linearized grid) is reconstructed from the function name and the locals
	// above - confirm against the upstream file.
	testStencilHeatSparse_perf<blockEdgeSize, gridEdgeSize,
			SparseGridGpu_z<dim, AggregateT, blockEdgeSize, chunkSize, long int>>(i, base,
			fillMultiplier, voidMultiplier);
	cudaDeviceSynchronize();
}
173 
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Every *_gridScaling case runs the same benchmark with 8x8 blocks on grids of
// 128, 256, 512 and 1024 blocks per edge, stores the runs as testURI(0..3) and
// finally registers testURI in testSet. The "05"/"08"/"09" suffix is the
// occupancy label; the N/Z infix selects which launcher is called.

BOOST_AUTO_TEST_CASE(testStencilHeatSparse05_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 1.45;
	const float gap = 1;
	const std::string testURI = suiteURI + ".device.stencil.sparse.N.2D.05.gridScaling";

	launch_testStencilHeatSparse_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "05");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "05");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "05");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "05");

	testSet.insert(testURI);
}


BOOST_AUTO_TEST_CASE(testStencilHeatSparse08_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 2;
	const float gap = 0.20;
	const std::string testURI = suiteURI + ".device.stencil.sparse.N.2D.08.gridScaling";

	launch_testStencilHeatSparse_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "08");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "08");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "08");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "08");

	testSet.insert(testURI);
}


BOOST_AUTO_TEST_CASE(testStencilHeatSparse09_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 2.3;
	const float gap = 0.07;
	const std::string testURI = suiteURI + ".device.stencil.sparse.N.2D.09.gridScaling";

	launch_testStencilHeatSparse_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "09");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "09");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "09");
	launch_testStencilHeatSparse_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "09");

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testStencilHeatSparseZ05_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 1.45;
	const float gap = 1;
	const std::string testURI = suiteURI + ".device.stencil.sparse.Z.2D.05.gridScaling";

	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "05");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "05");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "05");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "05");

	testSet.insert(testURI);
}


BOOST_AUTO_TEST_CASE(testStencilHeatSparseZ08_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 2;
	const float gap = 0.20;
	const std::string testURI = suiteURI + ".device.stencil.sparse.Z.2D.08.gridScaling";

	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "08");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "08");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "08");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "08");

	testSet.insert(testURI);
}


BOOST_AUTO_TEST_CASE(testStencilHeatSparseZ09_gridScaling)
{
	constexpr unsigned int blockEdgeSize = 8;
	const float fill = 2.3;
	const float gap = 0.07;
	const std::string testURI = suiteURI + ".device.stencil.sparse.Z.2D.09.gridScaling";

	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 128>(testURI, 0, fill, gap, "09");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 256>(testURI, 1, fill, gap, "09");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 512>(testURI, 2, fill, gap, "09");
	launch_testStencilHeatSparseZ_perf<blockEdgeSize, 1024>(testURI, 3, fill, gap, "09");

	testSet.insert(testURI);
}

// Single run with 32x32 blocks covering a 2048-cell edge (2048/32 blocks per edge).
BOOST_AUTO_TEST_CASE(testStencilHeatSparse05_32Block_2048Grid_Case)
{
	constexpr unsigned int blockEdgeSize = 32;
	constexpr unsigned int gridEdgeSize = 2048 / blockEdgeSize;
	const float fill = 1.45;
	const float gap = 1;
	const std::string testURI = suiteURI + ".device.stencil.sparse.N.2D.05.32_2048";

	launch_testStencilHeatSparse_perf<blockEdgeSize, gridEdgeSize>(testURI, 0, fill, gap, "05");

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
Aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119