OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
SparseGridGpu_performance_heat_stencil.cu
1 /*
2  * SparseGridGpu_performance_heat_stencil.cu
3  *
4  * Created on: Sep 10, 2019
5  * Author: i-bird
6  */
7 #define BOOST_TEST_DYN_LINK
8 #define DISABLE_MPI_WRITTERS
9 
10 #include <boost/test/unit_test.hpp>
11 #include "performancePlots.hpp"
12 #include <iostream>
13 #include "SparseGridGpu/SparseGridGpu.hpp"
14 #include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15 
16 extern std::string suiteURI;
17 extern report_sparse_grid_tests report_sparsegrid_funcs;
18 extern std::set<std::string> testSet;
19 
20 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
21 void testStencilHeat_perf(unsigned int i, std::string base)
22 {
23  auto testName = "In-place stencil";
24  typedef HeatStencil<SparseGridZ::dims,0,1> Stencil01T;
25  typedef HeatStencil<SparseGridZ::dims,1,0> Stencil10T;
26 
27  // typedef HeatStencil<SparseGridZ::dims,0,0> Stencil01T;
28  // typedef HeatStencil<SparseGridZ::dims,0,0> Stencil10T;
29 
30  report_sparsegrid_funcs.graphs.put(base + ".dim",2);
31  report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
32  report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
33  report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
34 
35  unsigned int iterations = 100;
36 
37  openfpm::vector<double> measures_gf;
38  openfpm::vector<double> measures_tm;
39 
40  dim3 gridSize(gridEdgeSize, gridEdgeSize);
41  dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
42  typename SparseGridZ::grid_info blockGeometry(gridSize);
43  SparseGridZ sparseGrid(blockGeometry);
44  mgpu::ofp_context_t ctx;
45  sparseGrid.template setBackgroundValue<0>(0);
46 
47  unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
48 
49  // Initialize the grid
50  sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
51  CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
52  sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
53 
54  sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
55  dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
56  insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
57  sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
58 
59  sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
60 
61  iterations /= 2;
62  for (unsigned int iter=0; iter<iterations; ++iter)
63  {
64  cudaDeviceSynchronize();
65  timer ts;
66  ts.start();
67 
68  sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
69  cudaDeviceSynchronize();
70  sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
71  cudaDeviceSynchronize();
72 
73  ts.stop();
74 
75  measures_tm.add(ts.getwct());
76 
77  float gElemS = 2 * numElements / (1e9 * ts.getwct());
78  float gFlopsS = gElemS * Stencil01T::flops;
79 
80  measures_gf.add(gFlopsS);
81  }
82 
83  double mean_tm = 0;
84  double deviation_tm = 0;
85  standard_deviation(measures_tm,mean_tm,deviation_tm);
86 
87  double mean_gf = 0;
88  double deviation_gf = 0;
89  standard_deviation(measures_gf,mean_gf,deviation_gf);
90 
91  // All times above are in ms
92 
93  float gElemS = 2 * numElements / (1e9 * mean_tm);
94  float gFlopsS = gElemS * Stencil01T::flops;
95  std::cout << "Test: " << testName << std::endl;
96  std::cout << "Block: " << SparseGridZ::blockEdgeSize_ << "x" << SparseGridZ::blockEdgeSize_ << std::endl;
97  std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
98  double dataOccupancyMean, dataOccupancyDev;
99  sparseGrid.deviceToHost();
100  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
101  report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
102  report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
103  std::cout << "Iterations: " << iterations << std::endl;
104  std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
105  std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
106 
107  report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
108  report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
109  report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
110  report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
111 }
112 
113 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
114 void launch_testStencilHeatZ_perf(std::string testURI, unsigned int i)
115 {
116  constexpr unsigned int dim = 2;
117  typedef aggregate<float,float> AggregateT;
118  // typedef aggregate<float> AggregateT;
119  constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
120 
121  std::string base(testURI + "(" + std::to_string(i) + ")");
122  report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilZ");
123 
124  testStencilHeat_perf<blockEdgeSize, gridEdgeSize,
126  cudaDeviceSynchronize();
127 }
128 
129 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
130 void launch_testStencilHeat_perf(std::string testURI, unsigned int i)
131 {
132  constexpr unsigned int dim = 2;
133  typedef aggregate<float,float> AggregateT;
134  // typedef aggregate<float> AggregateT;
135  constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
136 
137  std::string base(testURI + "(" + std::to_string(i) + ")");
138  report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN");
139 
140  testStencilHeat_perf<blockEdgeSize, gridEdgeSize,
142  cudaDeviceSynchronize();
143 }
144 
145 
146 BOOST_AUTO_TEST_SUITE(performance)
147 
148 BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
149 
150 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling)
151 {
152  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.gridScaling";
153  unsigned int counter = 0;
154  constexpr unsigned int blockEdgeSize = 8;
155  launch_testStencilHeat_perf<blockEdgeSize, 128>(testURI, counter++);
156  launch_testStencilHeat_perf<blockEdgeSize, 256>(testURI, counter++);
157  launch_testStencilHeat_perf<blockEdgeSize, 512>(testURI, counter++);
158  launch_testStencilHeat_perf<blockEdgeSize, 1024>(testURI, counter++);
159 
160  testSet.insert(testURI);
161 }
162 
163 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_2)
164 {
165  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.2.gridScaling";
166  unsigned int counter = 0;
167  launch_testStencilHeat_perf<2, 512>(testURI, counter++);
168  launch_testStencilHeat_perf<2, 1024>(testURI, counter++);
169  launch_testStencilHeat_perf<2, 2048>(testURI, counter++);
170  // launch_testStencilHeat_perf<2, 4096>(testURI, counter++); // test
171 
172  testSet.insert(testURI);
173 }
174 
175 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_4)
176 {
177  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.4.gridScaling";
178  unsigned int counter = 0;
179  launch_testStencilHeat_perf<4, 256>(testURI, counter++);
180  launch_testStencilHeat_perf<4, 512>(testURI, counter++);
181  launch_testStencilHeat_perf<4, 1024>(testURI, counter++);
182 // launch_testStencilHeat_perf<4, 2048>(testURI, counter++);
183 
184  testSet.insert(testURI);
185 }
186 
187 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_8)
188 {
189  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.8.gridScaling";
190  unsigned int counter = 0;
191  launch_testStencilHeat_perf<8, 128>(testURI, counter++);
192  launch_testStencilHeat_perf<8, 256>(testURI, counter++);
193  launch_testStencilHeat_perf<8, 512>(testURI, counter++);
194 // launch_testStencilHeat_perf<8, 1024>(testURI, counter++);
195 
196  testSet.insert(testURI);
197 }
198 
199 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_16)
200 {
201  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.16.gridScaling";
202  unsigned int counter = 0;
203  launch_testStencilHeat_perf<16, 64>(testURI, counter++);
204  launch_testStencilHeat_perf<16, 128>(testURI, counter++);
205  launch_testStencilHeat_perf<16, 256>(testURI, counter++);
206 // launch_testStencilHeat_perf<16, 512>(testURI, counter++);
207 
208  testSet.insert(testURI);
209 }
210 
211 BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_32)
212 {
213  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.32.gridScaling";
214  unsigned int counter = 0;
215  launch_testStencilHeat_perf<32, 32>(testURI, counter++);
216  launch_testStencilHeat_perf<32, 64>(testURI, counter++);
217  launch_testStencilHeat_perf<32, 128>(testURI, counter++);
218 // launch_testStencilHeat_perf<32, 256>(testURI, counter++); // test
219 
220  testSet.insert(testURI);
221 }
222 
223 BOOST_AUTO_TEST_CASE(testStencilHeat_blockScaling)
224 {
225  std::string testURI = suiteURI + ".device.stencil.dense.N.2D.blockScaling";
226  unsigned int counter = 0;
227  // Note - blockEdgeSize == 2 doesn't work
228  launch_testStencilHeat_perf<4, 2048>(testURI, counter++);
229  launch_testStencilHeat_perf<8, 1024>(testURI, counter++);
230  launch_testStencilHeat_perf<16, 512>(testURI, counter++);
231  launch_testStencilHeat_perf<32, 256>(testURI, counter++);
232 
233  testSet.insert(testURI);
234 }
235 
236 
237 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling)
238 {
239  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.gridScaling";
240  unsigned int counter = 0;
241  constexpr unsigned int blockEdgeSize = 8;
242  launch_testStencilHeatZ_perf<blockEdgeSize, 128>(testURI, counter++);
243  launch_testStencilHeatZ_perf<blockEdgeSize, 256>(testURI, counter++);
244  launch_testStencilHeatZ_perf<blockEdgeSize, 512>(testURI, counter++);
245  launch_testStencilHeatZ_perf<blockEdgeSize, 1024>(testURI, counter++);
246 // launch_testStencilHeatZ_perf<blockEdgeSize, 2048>(testURI, counter++);
247 
248  testSet.insert(testURI);
249 }
250 
251 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_2)
252 {
253  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.2.gridScaling";
254  unsigned int counter = 0;
255  launch_testStencilHeatZ_perf<2, 512>(testURI, counter++);
256  launch_testStencilHeatZ_perf<2, 1024>(testURI, counter++);
257  launch_testStencilHeatZ_perf<2, 2048>(testURI, counter++);
258  // launch_testStencilHeatZ_perf<2, 4096>(testURI, counter++);
259 
260  testSet.insert(testURI);
261 }
262 
263 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_4)
264 {
265  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.4.gridScaling";
266  unsigned int counter = 0;
267  launch_testStencilHeatZ_perf<4, 256>(testURI, counter++);
268  launch_testStencilHeatZ_perf<4, 512>(testURI, counter++);
269  launch_testStencilHeatZ_perf<4, 1024>(testURI, counter++);
270  launch_testStencilHeatZ_perf<4, 2048>(testURI, counter++);
271 
272  testSet.insert(testURI);
273 }
274 
275 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_8)
276 {
277  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.8.gridScaling";
278  unsigned int counter = 0;
279  launch_testStencilHeatZ_perf<8, 128>(testURI, counter++);
280  launch_testStencilHeatZ_perf<8, 256>(testURI, counter++);
281  launch_testStencilHeatZ_perf<8, 512>(testURI, counter++);
282  launch_testStencilHeatZ_perf<8, 1024>(testURI, counter++);
283 
284  testSet.insert(testURI);
285 }
286 
287 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_16)
288 {
289  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.16.gridScaling";
290  unsigned int counter = 0;
291  launch_testStencilHeatZ_perf<16, 64>(testURI, counter++);
292  launch_testStencilHeatZ_perf<16, 128>(testURI, counter++);
293  launch_testStencilHeatZ_perf<16, 256>(testURI, counter++);
294  launch_testStencilHeatZ_perf<16, 512>(testURI, counter++);
295 
296  testSet.insert(testURI);
297 }
298 
299 BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_32)
300 {
301  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.32.gridScaling";
302  unsigned int counter = 0;
303  launch_testStencilHeatZ_perf<32, 32>(testURI, counter++);
304  launch_testStencilHeatZ_perf<32, 64>(testURI, counter++);
305  launch_testStencilHeatZ_perf<32, 128>(testURI, counter++);
306  launch_testStencilHeatZ_perf<32, 256>(testURI, counter++);
307 
308  testSet.insert(testURI);
309 }
310 
311 BOOST_AUTO_TEST_CASE(testStencilHeatZ_blockScaling)
312 {
313  std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.blockScaling";
314  unsigned int counter = 0;
315  // Note - blockEdgeSize == 2 doesn't work
316  launch_testStencilHeatZ_perf<4, 2048>(testURI, counter++);
317  launch_testStencilHeatZ_perf<8, 1024>(testURI, counter++);
318  launch_testStencilHeatZ_perf<16, 512>(testURI, counter++);
319  launch_testStencilHeatZ_perf<32, 256>(testURI, counter++);
320 
321  testSet.insert(testURI);
322 }
323 
324 
325 BOOST_AUTO_TEST_SUITE_END()
326 
327 BOOST_AUTO_TEST_SUITE_END()
328 
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119