OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
SparseGridGpu_performance_tests.cu
1//
2// Created by tommaso on 4/07/19.
3//
4#define BOOST_TEST_DYN_LINK
5#define DISABLE_MPI_WRITTERS
6
7//#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE __launch_bounds__(512)
8#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED __launch_bounds__(BLOCK_SIZE_STENCIL,12)
9
10#include <boost/test/unit_test.hpp>
11#include "SparseGridGpu/SparseGridGpu.hpp"
12#include "cuda_macro.h"
13#include "util/stat/common_statistics.hpp"
14#include "Plot/GoogleChart.hpp"
15#include "util/performance/performance_util.hpp"
16#include "SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh"
17#include <set>
18#include "performancePlots.hpp"
19#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
20
21extern char * test_dir;
22
23// Property tree
24
25report_sparse_grid_tests report_sparsegrid_funcs;
26std::string suiteURI = "performance.SparseGridGpu";
27std::set<std::string> testSet;
28
29
30BOOST_AUTO_TEST_SUITE(performance)
31
32BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
33
34
35template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
36void testStencilHeatGet_perf(unsigned int i, std::string base)
37{
38 auto testName = "In-place GET stencil";
41
42 // typedef HeatStencilGet<SparseGridZ::dims,0,0> Stencil01T;
43 // typedef HeatStencilGet<SparseGridZ::dims,0,0> Stencil10T;
44
45 report_sparsegrid_funcs.graphs.put(base + ".dim",2);
46 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
47 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
48 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
49
50 unsigned int iterations = 100;
51
52 openfpm::vector<double> measures_gf;
53 openfpm::vector<double> measures_tm;
54
55 dim3 gridSize(gridEdgeSize, gridEdgeSize);
56 dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
57 typename SparseGridZ::grid_info blockGeometry(gridSize);
58 SparseGridZ sparseGrid(blockGeometry);
60 sparseGrid.template setBackgroundValue<0>(0);
61
62 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
63
64 // Initialize the grid
65 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
66 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
67 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
68
69 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
70 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
71 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
72 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
73
74 sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
75
76 iterations /= 2;
77 for (unsigned int iter=0; iter<iterations; ++iter)
78 {
79 cudaDeviceSynchronize();
80
81 timer ts;
82 ts.start();
83
84 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
85 cudaDeviceSynchronize();
86 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
87 cudaDeviceSynchronize();
88
89 ts.stop();
90
91 measures_tm.add(ts.getwct());
92
93 float gElemS = 2 * numElements / (1e9 * ts.getwct());
94 float gFlopsS = gElemS * Stencil01T::flops;
95
96 measures_gf.add(gFlopsS);
97 }
98
99 double mean_tm = 0;
100 double deviation_tm = 0;
101 standard_deviation(measures_tm,mean_tm,deviation_tm);
102
103 double mean_gf = 0;
104 double deviation_gf = 0;
105 standard_deviation(measures_gf,mean_gf,deviation_gf);
106
107 // All times above are in ms
108
109 float gElemS = 2 * numElements / (1e9 * mean_tm);
110 float gFlopsS = gElemS * Stencil01T::flops;
111 std::cout << "Test: " << testName << std::endl;
112 std::cout << "Block: " << SparseGridZ::blockEdgeSize_ << "x" << SparseGridZ::blockEdgeSize_ << std::endl;
113 std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
114 double dataOccupancyMean, dataOccupancyDev;
115 sparseGrid.deviceToHost();
116 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
117 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
118 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
119 std::cout << "Iterations: " << iterations << std::endl;
120 std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
121 std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
122
123 report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
124 report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
125 report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
126 report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
127}
128template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
129void launch_testStencilHeatGet_perf(std::string testURI, unsigned int i)
130{
131 constexpr unsigned int dim = 2;
132 typedef aggregate<float,float> AggregateT;
133 // typedef aggregate<float> AggregateT;
134 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
135
136 std::string base(testURI + "(" + std::to_string(i) + ")");
137 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN");
138
139 testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
141 cudaDeviceSynchronize();
142}
143template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
144void launch_testStencilHeatGetZ_perf(std::string testURI, unsigned int i)
145{
146 constexpr unsigned int dim = 2;
147 typedef aggregate<float,float> AggregateT;
148 // typedef aggregate<float> AggregateT;
149 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
150
151 std::string base(testURI + "(" + std::to_string(i) + ")");
152 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilZ");
153
154 testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
156 cudaDeviceSynchronize();
157}
158
159template<unsigned int blockEdgeSize, unsigned int gridEdgeSize, typename SparseGridZ>
160void testStencilSkeleton_perf(unsigned int i, std::string base)
161{
162 auto testName = "In-place stencil";
165
166 // typedef SkeletonStencil<SparseGridZ::dims,0,0> Stencil01T;
167 // typedef SkeletonStencil<SparseGridZ::dims,0,0> Stencil10T;
168
169 report_sparsegrid_funcs.graphs.put(base + ".dim",2);
170 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
171 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
172 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
173
174 unsigned int iterations = 100;
175
176 openfpm::vector<double> measures_gf;
177 openfpm::vector<double> measures_tm;
178
179 dim3 gridSize(gridEdgeSize, gridEdgeSize);
180 dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
181 typename SparseGridZ::grid_info blockGeometry(gridSize);
182 SparseGridZ sparseGrid(blockGeometry);
184 sparseGrid.template setBackgroundValue<0>(0);
185
186 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
187
188 // Initialize the grid
189 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
190 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
191 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
192
193 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
194 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
195 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
196 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
197
198 sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
199
200 iterations /= 2;
201 for (unsigned int iter=0; iter<iterations; ++iter)
202 {
203 cudaDeviceSynchronize();
204
205 timer ts;
206 ts.start();
207
208 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
209 cudaDeviceSynchronize();
210 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
211 cudaDeviceSynchronize();
212
213 ts.stop();
214
215 measures_tm.add(ts.getwct());
216
217 float gElemS = 2 * numElements / (1e9 * ts.getwct());
218 float gFlopsS = gElemS * Stencil01T::flops;
219
220 measures_gf.add(gFlopsS);
221 }
222
223 double mean_tm = 0;
224 double deviation_tm = 0;
225 standard_deviation(measures_tm,mean_tm,deviation_tm);
226
227 double mean_gf = 0;
228 double deviation_gf = 0;
229 standard_deviation(measures_gf,mean_gf,deviation_gf);
230
231 // All times above are in ms
232
233 float gElemS = 2 * numElements / (1e9 * mean_tm);
234 float gFlopsS = gElemS * Stencil01T::flops;
235 std::cout << "Test: " << testName << std::endl;
236 std::cout << "Block: " << SparseGridZ::blockEdgeSize_ << "x" << SparseGridZ::blockEdgeSize_ << std::endl;
237 std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
238 double dataOccupancyMean, dataOccupancyDev;
239 sparseGrid.deviceToHost();
240 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
241 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
242 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
243 std::cout << "Iterations: " << iterations << std::endl;
244 std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
245 std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
246
247 report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",mean_gf);
248 report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",deviation_gf);
249 report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
250 report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
251}
252
253void launch_testConv3x3x3_perf_z_morton(std::string testURI, unsigned int i)
254{
255 constexpr unsigned int dim = 3;
256 typedef aggregate<float,float> AggregateT;
257 constexpr unsigned int chunkSize = IntPow<8,dim>::value;
258
259 std::string base(testURI + "(" + std::to_string(i) + ")");
260 report_sparsegrid_funcs.graphs.put(base + ".test.name","Conv3x3x3");
261
262 testConv3x3x3_perf<SparseGridGpu_z<dim, AggregateT, 8, chunkSize,long int>>("Convolution 3x3x3 Z-morton");
263}
264
265void launch_testConv3x3x3_perf(std::string testURI, unsigned int i)
266{
267 constexpr unsigned int dim = 3;
268 typedef aggregate<float,float> AggregateT;
269 constexpr unsigned int chunkSize = IntPow<8,dim>::value;
270
271 std::string base(testURI + "(" + std::to_string(i) + ")");
272 report_sparsegrid_funcs.graphs.put(base + ".test.name","Conv3x3x3");
273
274 testConv3x3x3_perf<SparseGridGpu<dim, AggregateT, 8, chunkSize,long int>>("Convolution 3x3x3 ");
275}
276
277void launch_testConv3x3x3_perf_no_shared_z_morton(std::string testURI, unsigned int i)
278{
279 constexpr unsigned int dim = 3;
280 typedef aggregate<float,float> AggregateT;
281
282 std::string base(testURI + "(" + std::to_string(i) + ")");
283 report_sparsegrid_funcs.graphs.put(base + ".test.name","Conv3x3x3");
284
285 testConv3x3x3_no_shared_perf<SparseGridGpu_z<dim, AggregateT, 8, 512, long int>>("Convolution 3x3x3_noshared z-morton");
286}
287
288void launch_testConv3x3x3_perf_no_shared(std::string testURI, unsigned int i)
289{
290 constexpr unsigned int dim = 3;
291 typedef aggregate<float,float> AggregateT;
292
293 std::string base(testURI + "(" + std::to_string(i) + ")");
294 report_sparsegrid_funcs.graphs.put(base + ".test.name","Conv3x3x3");
295
296 testConv3x3x3_no_shared_perf<SparseGridGpu<dim, AggregateT, 8, 512, long int>>("Convolution 3x3x3_noshared");
297}
298
299template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
300void launch_testStencilSkeleton_perf(std::string testURI, unsigned int i)
301{
302 constexpr unsigned int dim = 2;
303 typedef aggregate<float,float> AggregateT;
304 // typedef aggregate<float> AggregateT;
305 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
306
307 std::string base(testURI + "(" + std::to_string(i) + ")");
308 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilN");
309
310 testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
312 cudaDeviceSynchronize();
313}
314
315
316template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
317void launch_testStencilSkeletonZ_perf(std::string testURI, unsigned int i)
318{
319 constexpr unsigned int dim = 2;
320 typedef aggregate<float,float> AggregateT;
321 // typedef aggregate<float> AggregateT;
322 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
323
324 std::string base(testURI + "(" + std::to_string(i) + ")");
325 report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilZ");
326
327 testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
329 cudaDeviceSynchronize();
330}
331
332BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared)
333{
334 std::string testURI = suiteURI + ".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
335 unsigned int counter = 0;
336 launch_testConv3x3x3_perf_no_shared(testURI, counter++);
337 testSet.insert(testURI);
338}
339
340BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared_z_morton)
341{
342 std::string testURI = suiteURI + ".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
343 unsigned int counter = 0;
344 launch_testConv3x3x3_perf_no_shared_z_morton(testURI, counter++);
345 testSet.insert(testURI);
346}
347
348BOOST_AUTO_TEST_CASE(testConv3x3x3)
349{
350 std::string testURI = suiteURI + ".device.conv3x3x3.sparse.N.3D.gridScaling";
351 unsigned int counter = 0;
352 launch_testConv3x3x3_perf(testURI, counter++);
353 testSet.insert(testURI);
354}
355
356BOOST_AUTO_TEST_CASE(testConv3x3x3_zmorton)
357{
358
359 std::string testURI = suiteURI + ".device.conv3x3x3_zmorton.sparse.N.3D.gridScaling";
360 unsigned int counter = 0;
361 launch_testConv3x3x3_perf_z_morton(testURI, counter++);
362 testSet.insert(testURI);
363}
364
365BOOST_AUTO_TEST_CASE(testStencilSkeleton_gridScaling)
366{
367 std::string testURI = suiteURI + ".device.skeleton.dense.N.2D.gridScaling";
368 unsigned int counter = 0;
369 constexpr unsigned int blockEdgeSize = 8;
370 launch_testStencilSkeleton_perf<blockEdgeSize, 128>(testURI, counter++);
371 launch_testStencilSkeleton_perf<blockEdgeSize, 256>(testURI, counter++);
372 launch_testStencilSkeleton_perf<blockEdgeSize, 512>(testURI, counter++);
373 launch_testStencilSkeleton_perf<blockEdgeSize, 1024>(testURI, counter++);
374
375 testSet.insert(testURI);
376}
377
378BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_2)
379{
380 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.2.gridScaling";
381 unsigned int counter = 0;
382 launch_testStencilHeatGet_perf<2, 512>(testURI, counter++);
383 launch_testStencilHeatGet_perf<2, 1024>(testURI, counter++);
384 launch_testStencilHeatGet_perf<2, 2048>(testURI, counter++);
385 // launch_testStencilHeatGet_perf<2, 4096>(testURI, counter++); // test
386
387 testSet.insert(testURI);
388}
389
390BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_4)
391{
392 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.4.gridScaling";
393 unsigned int counter = 0;
394 launch_testStencilHeatGet_perf<4, 256>(testURI, counter++);
395 launch_testStencilHeatGet_perf<4, 512>(testURI, counter++);
396 launch_testStencilHeatGet_perf<4, 1024>(testURI, counter++);
397 launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
398
399 testSet.insert(testURI);
400}
401
402BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_8)
403{
404 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.8.gridScaling";
405 unsigned int counter = 0;
406 launch_testStencilHeatGet_perf<8, 128>(testURI, counter++);
407 launch_testStencilHeatGet_perf<8, 256>(testURI, counter++);
408 launch_testStencilHeatGet_perf<8, 512>(testURI, counter++);
409 launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
410
411 testSet.insert(testURI);
412}
413
414BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_16)
415{
416 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.16.gridScaling";
417 unsigned int counter = 0;
418 launch_testStencilHeatGet_perf<16, 64>(testURI, counter++);
419 launch_testStencilHeatGet_perf<16, 128>(testURI, counter++);
420 launch_testStencilHeatGet_perf<16, 256>(testURI, counter++);
421 launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
422
423 testSet.insert(testURI);
424}
425
426BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_32)
427{
428 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.32.gridScaling";
429 unsigned int counter = 0;
430 launch_testStencilHeatGet_perf<32, 32>(testURI, counter++);
431 launch_testStencilHeatGet_perf<32, 64>(testURI, counter++);
432 launch_testStencilHeatGet_perf<32, 128>(testURI, counter++);
433 launch_testStencilHeatGet_perf<32, 256>(testURI, counter++); // test
434
435 testSet.insert(testURI);
436}
437
438BOOST_AUTO_TEST_CASE(testStencilHeatGet_blockScaling)
439{
440 std::string testURI = suiteURI + ".device.stencilGet.dense.N.2D.blockScaling";
441 unsigned int counter = 0;
442 // Note - blockEdgeSize == 2 doesn't work
443 launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
444 launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
445 launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
446 launch_testStencilHeatGet_perf<32, 256>(testURI, counter++);
447
448 testSet.insert(testURI);
449}
450
451
452BOOST_AUTO_TEST_CASE(write_teport)
453{
454 write_test_report(report_sparsegrid_funcs, testSet);
455}
456
457BOOST_AUTO_TEST_SUITE_END()
458
459BOOST_AUTO_TEST_SUITE_END()
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
Definition timer.hpp:28
void stop()
Stop the timer.
Definition timer.hpp:119
void start()
Start the timer.
Definition timer.hpp:90
double getwct()
Return the elapsed real time.
Definition timer.hpp:130
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...