OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
SparseGridGpu_performance_heat_stencil_3d.cu
1/*
2 * SparseGridGpu_performance_heat_stencil_3d.cu
3 *
4 * Created on: Sep 10, 2019
5 * Author: i-bird
6 */
7#define BOOST_TEST_DYN_LINK
8#define DISABLE_MPI_WRITTERS
9
10#include <boost/test/unit_test.hpp>
11#include "performancePlots.hpp"
12#include <iostream>
13#include "SparseGridGpu/SparseGridGpu.hpp"
14#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15
16extern std::string suiteURI;
17extern report_sparse_grid_tests report_sparsegrid_funcs;
18extern std::set<std::string> testSet;
19
20template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
21void testStencilHeat3D_perf(unsigned int i, std::string base)
22{
23 auto testName = "In-place 3D stencil";
24// unsigned int gridEdgeSize = 128;
25// unsigned int gridEdgeSize = 64;
26 typedef HeatStencil<SparseGridZ::dims,0,1> Stencil01T;
27 typedef HeatStencil<SparseGridZ::dims,1,0> Stencil10T;
28
29 report_sparsegrid_funcs.graphs.put(base + ".dim",3);
30 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
31 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
32 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
33 report_sparsegrid_funcs.graphs.put(base + ".gridSize.z",gridEdgeSize*SparseGridZ::blockEdgeSize_);
34
35 unsigned int iterations = 100;
36
37 openfpm::vector<double> measures_gf;
38 openfpm::vector<double> measures_tm;
39
40 dim3 gridSize(gridEdgeSize, gridEdgeSize, gridEdgeSize);
41 dim3 blockSize(SparseGridZ::blockEdgeSize_, SparseGridZ::blockEdgeSize_, SparseGridZ::blockEdgeSize_);
42
43 typename SparseGridZ::grid_info blockGeometry(gridSize);
44 SparseGridZ sparseGrid(blockGeometry);
46 sparseGrid.template setBackgroundValue<0>(0);
47
48 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_
49 *gridEdgeSize*SparseGridZ::blockEdgeSize_
50 *gridEdgeSize*SparseGridZ::blockEdgeSize_;
51
52 // Initialize the grid
53 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
54 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
55 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
56
57 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
58 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2,
59 gridSize.y * SparseGridZ::blockEdgeSize_ / 2,
60 gridSize.z * SparseGridZ::blockEdgeSize_ / 2);
61 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
62 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
63
64 sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
65
66 iterations /= 2;
67 for (unsigned int iter=0; iter<iterations; ++iter)
68 {
69 cudaDeviceSynchronize();
70
71 timer ts;
72 ts.start();
73
74 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
75 cudaDeviceSynchronize();
76 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
77 cudaDeviceSynchronize();
78
79 ts.stop();
80
81 measures_tm.add(ts.getwct());
82
83 float gElemS = 2 * numElements / (1e9 * ts.getwct());
84 float gFlopsS = gElemS * Stencil01T::flops;
85
86 measures_gf.add(gFlopsS);
87 }
88
89 double mean_tm = 0;
90 double deviation_tm = 0;
91 standard_deviation(measures_tm,mean_tm,deviation_tm);
92
93 double mean_gf = 0;
94 double deviation_gf = 0;
95 standard_deviation(measures_gf,mean_gf,deviation_gf);
96
97 // All times above are in ms
98
99 float gElemS = 2 * numElements / (1e9 * mean_tm);
100 float gFlopsS = gElemS * Stencil01T::flops;
101 std::cout << "Test: " << testName << std::endl;
102 std::cout << "Block: " << SparseGridZ::blockEdgeSize_
103 << "x" << SparseGridZ::blockEdgeSize_
104 << "x" << SparseGridZ::blockEdgeSize_
105 << std::endl;
106 std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_
107 << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_
108 << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_
109 << std::endl;
110 double dataOccupancyMean, dataOccupancyDev;
111 sparseGrid.deviceToHost();
112 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
113 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
114 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
115 std::cout << "Iterations: " << iterations << std::endl;
116 std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
117 std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
118
119 report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
120 report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
121 report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
122 report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
123}
124
125template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
126void launch_testStencilHeat3D_perf(std::string testURI, unsigned int i)
127{
128 constexpr unsigned int dim = 3;
129 typedef aggregate<float,float> AggregateT;
130 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
131
132 std::string base(testURI + "(" + std::to_string(i) + ")");
133 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN3D");
134
135 testStencilHeat3D_perf<blockEdgeSize, gridEdgeSize,
137 cudaDeviceSynchronize();
138}
139
140template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
141void testStencilHeat3DSparse_perf(unsigned int i, std::string base, float fillMultiplier=1, float voidMultiplier=1)
142{
143 auto testName = "In-place 3D sparse stencil";
144// unsigned int gridEdgeSize = 32;
145 constexpr unsigned int dim = SparseGridZ::dims;
146// const unsigned int blockEdgeSize = SparseGridZ::blockEdgeSize_;
147
148 typedef HeatStencil<dim, 0, 1> Stencil01T;
149 typedef HeatStencil<dim, 1, 0> Stencil10T;
150
151// std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").stencil");
152
153 report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
154 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
155 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x", gridEdgeSize * blockEdgeSize);
156 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y", gridEdgeSize * blockEdgeSize);
157 report_sparsegrid_funcs.graphs.put(base + ".gridSize.z", gridEdgeSize * blockEdgeSize);
158
159 unsigned int iterations = 100;
160
161 openfpm::vector<double> measures_gf;
162 openfpm::vector<double> measures_tm;
163
164 dim3 gridSize(gridEdgeSize, gridEdgeSize, gridEdgeSize);
165 unsigned int spatialEdgeSize = 10000;
166 size_t sz[3] = {spatialEdgeSize, spatialEdgeSize, spatialEdgeSize};
167 typename SparseGridZ::grid_info blockGeometry(sz);
168 SparseGridZ sparseGrid(blockGeometry);
170 sparseGrid.template setBackgroundValue<0>(0);
171
173 float allMultiplier = fillMultiplier + voidMultiplier;
174 const unsigned int numSpheres = gridEdgeSize / (2*allMultiplier);
175 unsigned int centerPoint = spatialEdgeSize / 2;
176
177 for (int i = 1; i <= numSpheres; ++i)
178 {
179 unsigned int rBig = allMultiplier*i * blockEdgeSize;
180 unsigned int rSmall = (allMultiplier*i - fillMultiplier) * blockEdgeSize;
181 // Sphere i-th
182 grid_key_dx<dim, int> start1({centerPoint, centerPoint, centerPoint});
183 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
184 CUDA_LAUNCH_DIM3((insertSphere3D<0>),
185 gridSize, dim3(blockEdgeSize * blockEdgeSize * blockEdgeSize, 1, 1),
186 sparseGrid.toKernel(), start1, rBig, rSmall, 1);
187 cudaDeviceSynchronize();
188 sparseGrid.template flush<smax_<0 >>(ctx, flush_type::FLUSH_ON_DEVICE);
189 cudaDeviceSynchronize();
190 }
192
193 sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
194 sparseGrid.tagBoundaries(ctx);
195
196 sparseGrid.template deviceToHost<0>(); // NECESSARY as count takes place on Host!
197 auto existingElements = sparseGrid.countExistingElements();
198 auto boundaryElements = sparseGrid.countBoundaryElements();
199 unsigned long long numElements = existingElements - boundaryElements;
200
201 // Now apply some boundary conditions
202 sparseGrid.template applyStencils<BoundaryStencilSetXRescaled<dim,0,0>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,
203 centerPoint, centerPoint + 2*blockEdgeSize*gridEdgeSize,
204 0.0, 10.0);
205
206 iterations /= 2;
207 for (unsigned int iter=0; iter<iterations; ++iter)
208 {
209
210 timer ts;
211 ts.start();
212
213 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
214 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
215
216 ts.stop();
217
218 measures_tm.add(ts.getwct());
219
220 float gElemS = 2 * numElements / (1e9 * ts.getwct());
221 float gFlopsS = gElemS * Stencil01T::flops;
222
223 measures_gf.add(gFlopsS);
224 }
225
226 double mean_tm = 0;
227 double deviation_tm = 0;
228 standard_deviation(measures_tm,mean_tm,deviation_tm);
229
230 double mean_gf = 0;
231 double deviation_gf = 0;
232 standard_deviation(measures_gf,mean_gf,deviation_gf);
233
234 // All times above are in ms
235
236 float gElemS = 2 * numElements / (1e9 * mean_tm);
237 float gFlopsS = gElemS * Stencil01T::flops;
238 std::cout << "Test: " << testName << std::endl;
239 std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "x" << blockEdgeSize << std::endl;
240 std::cout << "Grid: " << gridEdgeSize * blockEdgeSize
241 << "x" << gridEdgeSize * blockEdgeSize
242 << "x" << gridEdgeSize * blockEdgeSize
243 << std::endl;
244 double dataOccupancyMean, dataOccupancyDev;
245 sparseGrid.deviceToHost();
246 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
247 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
248 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
249 std::cout << "Iterations: " << iterations << std::endl;
250 std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
251 std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
252
253 report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
254 report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
255 report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
256 report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
257
258// // DEBUG
259// sparseGrid.template deviceToHost<0,1>();
260// sparseGrid.write("SparseGridGPU_testStencilHeat3DSparse_perf_DEBUG.vtk");
261}
262
263template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
264void launch_testStencilHeat3DSparse_perf(std::string testURI, unsigned int i,
265 float fillMultiplier=1, float voidMultiplier=1)
266{
267 constexpr unsigned int dim = 3;
268 typedef aggregate<float,float> AggregateT;
269 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
270
271 std::string base(testURI + "(" + std::to_string(i) + ")");
272 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN3DSparse");
273
274 testStencilHeat3DSparse_perf<blockEdgeSize, gridEdgeSize,
276 fillMultiplier, voidMultiplier);
277 cudaDeviceSynchronize();
278}
279
280BOOST_AUTO_TEST_SUITE(performance)
281
282BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
283
284BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling)
285{
286 std::string testURI = suiteURI + ".device.stencil.dense.N.3D.gridScaling";
287 unsigned int counter = 0;
288 constexpr unsigned int blockEdgeSize = 8;
289 launch_testStencilHeat3D_perf<blockEdgeSize, 8>(testURI, counter++);
290 launch_testStencilHeat3D_perf<blockEdgeSize, 16>(testURI, counter++);
291 launch_testStencilHeat3D_perf<blockEdgeSize, 32>(testURI, counter++);
292 launch_testStencilHeat3D_perf<blockEdgeSize, 64>(testURI, counter++);
293// launch_testStencilHeat3D_perf<blockEdgeSize, 128>(testURI, counter++);
294
295 testSet.insert(testURI);
296}
297
298BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_2)
299{
300 std::string testURI = suiteURI + ".device.stencil.dense.N.3D.2.gridScaling";
301 unsigned int counter = 0;
302 launch_testStencilHeat3D_perf<2, 32>(testURI, counter++);
303 launch_testStencilHeat3D_perf<2, 64>(testURI, counter++);
304 launch_testStencilHeat3D_perf<2, 128>(testURI, counter++);
305 // launch_testStencilHeat3D_perf<2, 256>(testURI, counter++);
306
307 testSet.insert(testURI);
308}
309
310BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_4)
311{
312 std::string testURI = suiteURI + ".device.stencil.dense.N.3D.4.gridScaling";
313 unsigned int counter = 0;
314 launch_testStencilHeat3D_perf<4, 16>(testURI, counter++);
315 launch_testStencilHeat3D_perf<4, 32>(testURI, counter++);
316 launch_testStencilHeat3D_perf<4, 64>(testURI, counter++);
317// launch_testStencilHeat3D_perf<4, 128>(testURI, counter++);
318
319 testSet.insert(testURI);
320}
321
322BOOST_AUTO_TEST_CASE(testStencilHeat3D_gridScaling_8)
323{
324 std::string testURI = suiteURI + ".device.stencil.dense.N.3D.8.gridScaling";
325 unsigned int counter = 0;
326 launch_testStencilHeat3D_perf<8, 8>(testURI, counter++);
327 launch_testStencilHeat3D_perf<8, 16>(testURI, counter++);
328 launch_testStencilHeat3D_perf<8, 32>(testURI, counter++);
329 launch_testStencilHeat3D_perf<8, 64>(testURI, counter++);
330
331 testSet.insert(testURI);
332}
333
334BOOST_AUTO_TEST_CASE(testStencilHeat3D_blockScaling)
335{
336 std::string testURI = suiteURI + ".device.stencil.dense.N.3D.blockScaling";
337 unsigned int counter = 0;
338 launch_testStencilHeat3D_perf<2, 128>(testURI, counter++);
339 launch_testStencilHeat3D_perf<4, 64>(testURI, counter++);
340 launch_testStencilHeat3D_perf<8, 32>(testURI, counter++);
341// launch_testStencilHeat3D_perf<16, 16>(testURI, counter++); // Too big, it doesn't work
342
343 testSet.insert(testURI);
344}
345
346BOOST_AUTO_TEST_CASE(testStencilHeat3DSparse_gridScaling)
347{
348 std::string testURI = suiteURI + ".device.stencil.sparse.N.3D.gridScaling";
349 unsigned int counter = 0;
350 constexpr unsigned int blockEdgeSize = 8;
351 launch_testStencilHeat3DSparse_perf<blockEdgeSize, 8>(testURI, counter++, 1, 1);
352 launch_testStencilHeat3DSparse_perf<blockEdgeSize, 16>(testURI, counter++, 1, 1);
353 launch_testStencilHeat3DSparse_perf<blockEdgeSize, 32>(testURI, counter++, 1, 1);
354 launch_testStencilHeat3DSparse_perf<blockEdgeSize, 64>(testURI, counter++, 1, 1);
355
356 testSet.insert(testURI);
357}
358
359BOOST_AUTO_TEST_CASE(testStencilHeat3DSparse_blockScaling)
360{
361 std::string testURI = suiteURI + ".device.stencil.sparse.N.3D.blockScaling";
362 unsigned int counter = 0;
363 launch_testStencilHeat3DSparse_perf<2, 128>(testURI, counter++, 1, 1);
364 launch_testStencilHeat3DSparse_perf<4, 64>(testURI, counter++, 1, 1);
365 launch_testStencilHeat3DSparse_perf<8, 32>(testURI, counter++, 1, 1);
366// launch_testStencilHeat3DSparse_perf<16, 16>(testURI, counter++, 1, 1); // Too big, it doesn't work
367
368 testSet.insert(testURI);
369}
370
371
372
373BOOST_AUTO_TEST_SUITE_END()
374
375BOOST_AUTO_TEST_SUITE_END()
grid_key_dx is the key to access any element in the grid
Definition grid_key.hpp:19
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
Definition timer.hpp:28
void stop()
Stop the timer.
Definition timer.hpp:119
void start()
Start the timer.
Definition timer.hpp:90
double getwct()
Return the elapsed real time.
Definition timer.hpp:130
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...