OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
SparseGridGpu_performance_heat_stencil.cu
1/*
2 * SparseGridGpu_performance_heat_stencil.cu
3 *
4 * Created on: Sep 10, 2019
5 * Author: i-bird
6 */
7#define BOOST_TEST_DYN_LINK
8#define DISABLE_MPI_WRITTERS
9
10#include <boost/test/unit_test.hpp>
11#include "performancePlots.hpp"
12#include <iostream>
13#include "SparseGridGpu/SparseGridGpu.hpp"
14#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15
16extern std::string suiteURI;
17extern report_sparse_grid_tests report_sparsegrid_funcs;
18extern std::set<std::string> testSet;
19
20template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
21void testStencilHeat_perf(unsigned int i, std::string base)
22{
23 auto testName = "In-place stencil";
24 typedef HeatStencil<SparseGridZ::dims,0,1> Stencil01T;
25 typedef HeatStencil<SparseGridZ::dims,1,0> Stencil10T;
26
27 // typedef HeatStencil<SparseGridZ::dims,0,0> Stencil01T;
28 // typedef HeatStencil<SparseGridZ::dims,0,0> Stencil10T;
29
30 report_sparsegrid_funcs.graphs.put(base + ".dim",2);
31 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
32 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
33 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
34
35 unsigned int iterations = 100;
36
37 openfpm::vector<double> measures_gf;
38 openfpm::vector<double> measures_tm;
39
40 dim3 gridSize(gridEdgeSize, gridEdgeSize);
41 dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
42 typename SparseGridZ::grid_info blockGeometry(gridSize);
43 SparseGridZ sparseGrid(blockGeometry);
45 sparseGrid.template setBackgroundValue<0>(0);
46
47 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
48
49 // Initialize the grid
50 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
51 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
52 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
53
54 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
55 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
56 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
57 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
58
59 sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
60
61 iterations /= 2;
62 for (unsigned int iter=0; iter<iterations; ++iter)
63 {
64 cudaDeviceSynchronize();
65 timer ts;
66 ts.start();
67
68 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
69 cudaDeviceSynchronize();
70 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
71 cudaDeviceSynchronize();
72
73 ts.stop();
74
75 measures_tm.add(ts.getwct());
76
77 float gElemS = 2 * numElements / (1e9 * ts.getwct());
78 float gFlopsS = gElemS * Stencil01T::flops;
79
80 measures_gf.add(gFlopsS);
81 }
82
83 double mean_tm = 0;
84 double deviation_tm = 0;
85 standard_deviation(measures_tm,mean_tm,deviation_tm);
86
87 double mean_gf = 0;
88 double deviation_gf = 0;
89 standard_deviation(measures_gf,mean_gf,deviation_gf);
90
91 // All times above are in ms
92
93 float gElemS = 2 * numElements / (1e9 * mean_tm);
94 float gFlopsS = gElemS * Stencil01T::flops;
95 std::cout << "Test: " << testName << std::endl;
96 std::cout << "Block: " << SparseGridZ::blockEdgeSize_ << "x" << SparseGridZ::blockEdgeSize_ << std::endl;
97 std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
98 double dataOccupancyMean, dataOccupancyDev;
99 sparseGrid.deviceToHost();
100 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
101 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
102 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
103 std::cout << "Iterations: " << iterations << std::endl;
104 std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
105 std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
106
107 report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
108 report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
109 report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
110 report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
111}
112
113template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
114void launch_testStencilHeatZ_perf(std::string testURI, unsigned int i)
115{
116 constexpr unsigned int dim = 2;
117 typedef aggregate<float,float> AggregateT;
118 // typedef aggregate<float> AggregateT;
119 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
120
121 std::string base(testURI + "(" + std::to_string(i) + ")");
122 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilZ");
123
124 testStencilHeat_perf<blockEdgeSize, gridEdgeSize,
126 cudaDeviceSynchronize();
127}
128
129template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
130void launch_testStencilHeat_perf(std::string testURI, unsigned int i)
131{
132 constexpr unsigned int dim = 2;
133 typedef aggregate<float,float> AggregateT;
134 // typedef aggregate<float> AggregateT;
135 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
136
137 std::string base(testURI + "(" + std::to_string(i) + ")");
138 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN");
139
140 testStencilHeat_perf<blockEdgeSize, gridEdgeSize,
142 cudaDeviceSynchronize();
143}
144
145
146BOOST_AUTO_TEST_SUITE(performance)
147
148BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
149
150BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling)
151{
152 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.gridScaling";
153 unsigned int counter = 0;
154 constexpr unsigned int blockEdgeSize = 8;
155 launch_testStencilHeat_perf<blockEdgeSize, 128>(testURI, counter++);
156 launch_testStencilHeat_perf<blockEdgeSize, 256>(testURI, counter++);
157 launch_testStencilHeat_perf<blockEdgeSize, 512>(testURI, counter++);
158 launch_testStencilHeat_perf<blockEdgeSize, 1024>(testURI, counter++);
159
160 testSet.insert(testURI);
161}
162
163BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_2)
164{
165 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.2.gridScaling";
166 unsigned int counter = 0;
167 launch_testStencilHeat_perf<2, 512>(testURI, counter++);
168 launch_testStencilHeat_perf<2, 1024>(testURI, counter++);
169 launch_testStencilHeat_perf<2, 2048>(testURI, counter++);
170 // launch_testStencilHeat_perf<2, 4096>(testURI, counter++); // test
171
172 testSet.insert(testURI);
173}
174
175BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_4)
176{
177 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.4.gridScaling";
178 unsigned int counter = 0;
179 launch_testStencilHeat_perf<4, 256>(testURI, counter++);
180 launch_testStencilHeat_perf<4, 512>(testURI, counter++);
181 launch_testStencilHeat_perf<4, 1024>(testURI, counter++);
182// launch_testStencilHeat_perf<4, 2048>(testURI, counter++);
183
184 testSet.insert(testURI);
185}
186
187BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_8)
188{
189 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.8.gridScaling";
190 unsigned int counter = 0;
191 launch_testStencilHeat_perf<8, 128>(testURI, counter++);
192 launch_testStencilHeat_perf<8, 256>(testURI, counter++);
193 launch_testStencilHeat_perf<8, 512>(testURI, counter++);
194// launch_testStencilHeat_perf<8, 1024>(testURI, counter++);
195
196 testSet.insert(testURI);
197}
198
199BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_16)
200{
201 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.16.gridScaling";
202 unsigned int counter = 0;
203 launch_testStencilHeat_perf<16, 64>(testURI, counter++);
204 launch_testStencilHeat_perf<16, 128>(testURI, counter++);
205 launch_testStencilHeat_perf<16, 256>(testURI, counter++);
206// launch_testStencilHeat_perf<16, 512>(testURI, counter++);
207
208 testSet.insert(testURI);
209}
210
211BOOST_AUTO_TEST_CASE(testStencilHeat_gridScaling_32)
212{
213 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.32.gridScaling";
214 unsigned int counter = 0;
215 launch_testStencilHeat_perf<32, 32>(testURI, counter++);
216 launch_testStencilHeat_perf<32, 64>(testURI, counter++);
217 launch_testStencilHeat_perf<32, 128>(testURI, counter++);
218// launch_testStencilHeat_perf<32, 256>(testURI, counter++); // test
219
220 testSet.insert(testURI);
221}
222
223BOOST_AUTO_TEST_CASE(testStencilHeat_blockScaling)
224{
225 std::string testURI = suiteURI + ".device.stencil.dense.N.2D.blockScaling";
226 unsigned int counter = 0;
227 // Note - blockEdgeSize == 2 doesn't work
228 launch_testStencilHeat_perf<4, 2048>(testURI, counter++);
229 launch_testStencilHeat_perf<8, 1024>(testURI, counter++);
230 launch_testStencilHeat_perf<16, 512>(testURI, counter++);
231 launch_testStencilHeat_perf<32, 256>(testURI, counter++);
232
233 testSet.insert(testURI);
234}
235
236
237BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling)
238{
239 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.gridScaling";
240 unsigned int counter = 0;
241 constexpr unsigned int blockEdgeSize = 8;
242 launch_testStencilHeatZ_perf<blockEdgeSize, 128>(testURI, counter++);
243 launch_testStencilHeatZ_perf<blockEdgeSize, 256>(testURI, counter++);
244 launch_testStencilHeatZ_perf<blockEdgeSize, 512>(testURI, counter++);
245 launch_testStencilHeatZ_perf<blockEdgeSize, 1024>(testURI, counter++);
246// launch_testStencilHeatZ_perf<blockEdgeSize, 2048>(testURI, counter++);
247
248 testSet.insert(testURI);
249}
250
251BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_2)
252{
253 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.2.gridScaling";
254 unsigned int counter = 0;
255 launch_testStencilHeatZ_perf<2, 512>(testURI, counter++);
256 launch_testStencilHeatZ_perf<2, 1024>(testURI, counter++);
257 launch_testStencilHeatZ_perf<2, 2048>(testURI, counter++);
258 // launch_testStencilHeatZ_perf<2, 4096>(testURI, counter++);
259
260 testSet.insert(testURI);
261}
262
263BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_4)
264{
265 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.4.gridScaling";
266 unsigned int counter = 0;
267 launch_testStencilHeatZ_perf<4, 256>(testURI, counter++);
268 launch_testStencilHeatZ_perf<4, 512>(testURI, counter++);
269 launch_testStencilHeatZ_perf<4, 1024>(testURI, counter++);
270 launch_testStencilHeatZ_perf<4, 2048>(testURI, counter++);
271
272 testSet.insert(testURI);
273}
274
275BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_8)
276{
277 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.8.gridScaling";
278 unsigned int counter = 0;
279 launch_testStencilHeatZ_perf<8, 128>(testURI, counter++);
280 launch_testStencilHeatZ_perf<8, 256>(testURI, counter++);
281 launch_testStencilHeatZ_perf<8, 512>(testURI, counter++);
282 launch_testStencilHeatZ_perf<8, 1024>(testURI, counter++);
283
284 testSet.insert(testURI);
285}
286
287BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_16)
288{
289 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.16.gridScaling";
290 unsigned int counter = 0;
291 launch_testStencilHeatZ_perf<16, 64>(testURI, counter++);
292 launch_testStencilHeatZ_perf<16, 128>(testURI, counter++);
293 launch_testStencilHeatZ_perf<16, 256>(testURI, counter++);
294 launch_testStencilHeatZ_perf<16, 512>(testURI, counter++);
295
296 testSet.insert(testURI);
297}
298
299BOOST_AUTO_TEST_CASE(testStencilHeatZ_gridScaling_32)
300{
301 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.32.gridScaling";
302 unsigned int counter = 0;
303 launch_testStencilHeatZ_perf<32, 32>(testURI, counter++);
304 launch_testStencilHeatZ_perf<32, 64>(testURI, counter++);
305 launch_testStencilHeatZ_perf<32, 128>(testURI, counter++);
306 launch_testStencilHeatZ_perf<32, 256>(testURI, counter++);
307
308 testSet.insert(testURI);
309}
310
311BOOST_AUTO_TEST_CASE(testStencilHeatZ_blockScaling)
312{
313 std::string testURI = suiteURI + ".device.stencil.dense.Z.2D.blockScaling";
314 unsigned int counter = 0;
315 // Note - blockEdgeSize == 2 doesn't work
316 launch_testStencilHeatZ_perf<4, 2048>(testURI, counter++);
317 launch_testStencilHeatZ_perf<8, 1024>(testURI, counter++);
318 launch_testStencilHeatZ_perf<16, 512>(testURI, counter++);
319 launch_testStencilHeatZ_perf<32, 256>(testURI, counter++);
320
321 testSet.insert(testURI);
322}
323
324
325BOOST_AUTO_TEST_SUITE_END()
326
327BOOST_AUTO_TEST_SUITE_END()
328
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
Definition timer.hpp:28
void stop()
Stop the timer.
Definition timer.hpp:119
void start()
Start the timer.
Definition timer.hpp:90
double getwct()
Return the elapsed real time.
Definition timer.hpp:130
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...