4#define BOOST_TEST_DYN_LINK
5#define DISABLE_MPI_WRITTERS
8#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED __launch_bounds__(BLOCK_SIZE_STENCIL,12)
10#include <boost/test/unit_test.hpp>
11#include "SparseGridGpu/SparseGridGpu.hpp"
12#include "cuda_macro.h"
13#include "util/stat/common_statistics.hpp"
14#include "Plot/GoogleChart.hpp"
15#include "util/performance/performance_util.hpp"
16#include "SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh"
18#include "performancePlots.hpp"
19#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
21extern char * test_dir;
26std::string suiteURI =
"performance.SparseGridGpu";
27std::set<std::string> testSet;
30BOOST_AUTO_TEST_SUITE(performance)
32BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
35template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize,
typename SparseGr
idZ>
36void testStencilHeatGet_perf(
unsigned int i, std::string base)
38 auto testName =
"In-place GET stencil";
45 report_sparsegrid_funcs.graphs.put(base +
".dim",2);
46 report_sparsegrid_funcs.graphs.put(base +
".blockSize",blockEdgeSize);
47 report_sparsegrid_funcs.graphs.put(base +
".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
48 report_sparsegrid_funcs.graphs.put(base +
".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
50 unsigned int iterations = 100;
55 dim3 gridSize(gridEdgeSize, gridEdgeSize);
56 dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
57 typename SparseGridZ::grid_info blockGeometry(gridSize);
58 SparseGridZ sparseGrid(blockGeometry);
60 sparseGrid.template setBackgroundValue<0>(0);
62 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
65 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
66 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
67 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
69 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
70 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
71 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
72 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
74 sparseGrid.findNeighbours();
77 for (
unsigned int iter=0; iter<iterations; ++iter)
79 cudaDeviceSynchronize();
84 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
85 cudaDeviceSynchronize();
86 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
87 cudaDeviceSynchronize();
91 measures_tm.add(ts.
getwct());
93 float gElemS = 2 * numElements / (1e9 * ts.
getwct());
94 float gFlopsS = gElemS * Stencil01T::flops;
96 measures_gf.add(gFlopsS);
100 double deviation_tm = 0;
101 standard_deviation(measures_tm,mean_tm,deviation_tm);
104 double deviation_gf = 0;
105 standard_deviation(measures_gf,mean_gf,deviation_gf);
109 float gElemS = 2 * numElements / (1e9 * mean_tm);
110 float gFlopsS = gElemS * Stencil01T::flops;
111 std::cout <<
"Test: " << testName << std::endl;
112 std::cout <<
"Block: " << SparseGridZ::blockEdgeSize_ <<
"x" << SparseGridZ::blockEdgeSize_ << std::endl;
113 std::cout <<
"Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ <<
"x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
114 double dataOccupancyMean, dataOccupancyDev;
115 sparseGrid.deviceToHost();
116 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout <<
"Data Occupancy: " << dataOccupancyMean <<
" dev:" << dataOccupancyDev << std::endl;
117 report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.mean",dataOccupancyMean);
118 report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.dev",dataOccupancyDev);
119 std::cout <<
"Iterations: " << iterations << std::endl;
120 std::cout <<
"\tStencil: " << mean_gf <<
" dev:" << deviation_gf <<
" s" << std::endl;
121 std::cout <<
"Throughput: " << std::endl <<
"\t " << gElemS <<
" GElem/s " << std::endl <<
"\t " << gFlopsS <<
" GFlops/s" << std::endl;
123 report_sparsegrid_funcs.graphs.put(base +
".GFlops.mean",mean_gf);
124 report_sparsegrid_funcs.graphs.put(base +
".GFlops.dev",deviation_gf);
125 report_sparsegrid_funcs.graphs.put(base +
".time.mean",mean_tm);
126 report_sparsegrid_funcs.graphs.put(base +
".time.dev",deviation_tm);
128template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize>
129void launch_testStencilHeatGet_perf(std::string testURI,
unsigned int i)
131 constexpr unsigned int dim = 2;
136 std::string base(testURI +
"(" + std::to_string(i) +
")");
137 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"StencilN");
139 testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
141 cudaDeviceSynchronize();
143template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize>
144void launch_testStencilHeatGetZ_perf(std::string testURI,
unsigned int i)
146 constexpr unsigned int dim = 2;
151 std::string base(testURI +
"(" + std::to_string(i) +
")");
152 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"StencilZ");
154 testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
156 cudaDeviceSynchronize();
159template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize,
typename SparseGr
idZ>
160void testStencilSkeleton_perf(
unsigned int i, std::string base)
162 auto testName =
"In-place stencil";
169 report_sparsegrid_funcs.graphs.put(base +
".dim",2);
170 report_sparsegrid_funcs.graphs.put(base +
".blockSize",blockEdgeSize);
171 report_sparsegrid_funcs.graphs.put(base +
".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
172 report_sparsegrid_funcs.graphs.put(base +
".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
174 unsigned int iterations = 100;
179 dim3 gridSize(gridEdgeSize, gridEdgeSize);
180 dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
181 typename SparseGridZ::grid_info blockGeometry(gridSize);
182 SparseGridZ sparseGrid(blockGeometry);
184 sparseGrid.template setBackgroundValue<0>(0);
186 unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
189 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
190 CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
191 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
193 sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
194 dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
195 insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
196 sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
198 sparseGrid.findNeighbours();
201 for (
unsigned int iter=0; iter<iterations; ++iter)
203 cudaDeviceSynchronize();
208 sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
209 cudaDeviceSynchronize();
210 sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
211 cudaDeviceSynchronize();
215 measures_tm.add(ts.
getwct());
217 float gElemS = 2 * numElements / (1e9 * ts.
getwct());
218 float gFlopsS = gElemS * Stencil01T::flops;
220 measures_gf.add(gFlopsS);
224 double deviation_tm = 0;
225 standard_deviation(measures_tm,mean_tm,deviation_tm);
228 double deviation_gf = 0;
229 standard_deviation(measures_gf,mean_gf,deviation_gf);
233 float gElemS = 2 * numElements / (1e9 * mean_tm);
234 float gFlopsS = gElemS * Stencil01T::flops;
235 std::cout <<
"Test: " << testName << std::endl;
236 std::cout <<
"Block: " << SparseGridZ::blockEdgeSize_ <<
"x" << SparseGridZ::blockEdgeSize_ << std::endl;
237 std::cout <<
"Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ <<
"x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
238 double dataOccupancyMean, dataOccupancyDev;
239 sparseGrid.deviceToHost();
240 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout <<
"Data Occupancy: " << dataOccupancyMean <<
" dev:" << dataOccupancyDev << std::endl;
241 report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.mean",dataOccupancyMean);
242 report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.dev",dataOccupancyDev);
243 std::cout <<
"Iterations: " << iterations << std::endl;
244 std::cout <<
"\tStencil: " << mean_gf <<
" dev:" << deviation_gf <<
" s" << std::endl;
245 std::cout <<
"Throughput: " << std::endl <<
"\t " << gElemS <<
" GElem/s " << std::endl <<
"\t " << gFlopsS <<
" GFlops/s" << std::endl;
247 report_sparsegrid_funcs.graphs.put(base +
".GFlops.mean",mean_gf);
248 report_sparsegrid_funcs.graphs.put(base +
".GFlops.dev",deviation_gf);
249 report_sparsegrid_funcs.graphs.put(base +
".time.mean",mean_tm);
250 report_sparsegrid_funcs.graphs.put(base +
".time.dev",deviation_tm);
253void launch_testConv3x3x3_perf_z_morton(std::string testURI,
unsigned int i)
255 constexpr unsigned int dim = 3;
259 std::string base(testURI +
"(" + std::to_string(i) +
")");
260 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"Conv3x3x3");
262 testConv3x3x3_perf<SparseGridGpu_z<dim, AggregateT, 8, chunkSize,long int>>(
"Convolution 3x3x3 Z-morton");
265void launch_testConv3x3x3_perf(std::string testURI,
unsigned int i)
267 constexpr unsigned int dim = 3;
271 std::string base(testURI +
"(" + std::to_string(i) +
")");
272 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"Conv3x3x3");
274 testConv3x3x3_perf<SparseGridGpu<dim, AggregateT, 8, chunkSize,long int>>(
"Convolution 3x3x3 ");
277void launch_testConv3x3x3_perf_no_shared_z_morton(std::string testURI,
unsigned int i)
279 constexpr unsigned int dim = 3;
282 std::string base(testURI +
"(" + std::to_string(i) +
")");
283 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"Conv3x3x3");
285 testConv3x3x3_no_shared_perf<SparseGridGpu_z<dim, AggregateT, 8, 512, long int>>(
"Convolution 3x3x3_noshared z-morton");
288void launch_testConv3x3x3_perf_no_shared(std::string testURI,
unsigned int i)
290 constexpr unsigned int dim = 3;
293 std::string base(testURI +
"(" + std::to_string(i) +
")");
294 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"Conv3x3x3");
296 testConv3x3x3_no_shared_perf<SparseGridGpu<dim, AggregateT, 8, 512, long int>>(
"Convolution 3x3x3_noshared");
299template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize>
300void launch_testStencilSkeleton_perf(std::string testURI,
unsigned int i)
302 constexpr unsigned int dim = 2;
307 std::string base(testURI +
"(" + std::to_string(i) +
")");
308 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"StencilN");
310 testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
312 cudaDeviceSynchronize();
316template<
unsigned int blockEdgeSize,
unsigned int gr
idEdgeSize>
317void launch_testStencilSkeletonZ_perf(std::string testURI,
unsigned int i)
319 constexpr unsigned int dim = 2;
324 std::string base(testURI +
"(" + std::to_string(i) +
")");
325 report_sparsegrid_funcs.graphs.put(base +
".test.name",
"StencilZ");
327 testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
329 cudaDeviceSynchronize();
332BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared)
334 std::string testURI = suiteURI +
".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
335 unsigned int counter = 0;
336 launch_testConv3x3x3_perf_no_shared(testURI, counter++);
337 testSet.insert(testURI);
340BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared_z_morton)
342 std::string testURI = suiteURI +
".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
343 unsigned int counter = 0;
344 launch_testConv3x3x3_perf_no_shared_z_morton(testURI, counter++);
345 testSet.insert(testURI);
348BOOST_AUTO_TEST_CASE(testConv3x3x3)
350 std::string testURI = suiteURI +
".device.conv3x3x3.sparse.N.3D.gridScaling";
351 unsigned int counter = 0;
352 launch_testConv3x3x3_perf(testURI, counter++);
353 testSet.insert(testURI);
356BOOST_AUTO_TEST_CASE(testConv3x3x3_zmorton)
359 std::string testURI = suiteURI +
".device.conv3x3x3_zmorton.sparse.N.3D.gridScaling";
360 unsigned int counter = 0;
361 launch_testConv3x3x3_perf_z_morton(testURI, counter++);
362 testSet.insert(testURI);
365BOOST_AUTO_TEST_CASE(testStencilSkeleton_gridScaling)
367 std::string testURI = suiteURI +
".device.skeleton.dense.N.2D.gridScaling";
368 unsigned int counter = 0;
369 constexpr unsigned int blockEdgeSize = 8;
370 launch_testStencilSkeleton_perf<blockEdgeSize, 128>(testURI, counter++);
371 launch_testStencilSkeleton_perf<blockEdgeSize, 256>(testURI, counter++);
372 launch_testStencilSkeleton_perf<blockEdgeSize, 512>(testURI, counter++);
373 launch_testStencilSkeleton_perf<blockEdgeSize, 1024>(testURI, counter++);
375 testSet.insert(testURI);
378BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_2)
380 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.2.gridScaling";
381 unsigned int counter = 0;
382 launch_testStencilHeatGet_perf<2, 512>(testURI, counter++);
383 launch_testStencilHeatGet_perf<2, 1024>(testURI, counter++);
384 launch_testStencilHeatGet_perf<2, 2048>(testURI, counter++);
387 testSet.insert(testURI);
390BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_4)
392 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.4.gridScaling";
393 unsigned int counter = 0;
394 launch_testStencilHeatGet_perf<4, 256>(testURI, counter++);
395 launch_testStencilHeatGet_perf<4, 512>(testURI, counter++);
396 launch_testStencilHeatGet_perf<4, 1024>(testURI, counter++);
397 launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
399 testSet.insert(testURI);
402BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_8)
404 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.8.gridScaling";
405 unsigned int counter = 0;
406 launch_testStencilHeatGet_perf<8, 128>(testURI, counter++);
407 launch_testStencilHeatGet_perf<8, 256>(testURI, counter++);
408 launch_testStencilHeatGet_perf<8, 512>(testURI, counter++);
409 launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
411 testSet.insert(testURI);
414BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_16)
416 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.16.gridScaling";
417 unsigned int counter = 0;
418 launch_testStencilHeatGet_perf<16, 64>(testURI, counter++);
419 launch_testStencilHeatGet_perf<16, 128>(testURI, counter++);
420 launch_testStencilHeatGet_perf<16, 256>(testURI, counter++);
421 launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
423 testSet.insert(testURI);
426BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_32)
428 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.32.gridScaling";
429 unsigned int counter = 0;
430 launch_testStencilHeatGet_perf<32, 32>(testURI, counter++);
431 launch_testStencilHeatGet_perf<32, 64>(testURI, counter++);
432 launch_testStencilHeatGet_perf<32, 128>(testURI, counter++);
433 launch_testStencilHeatGet_perf<32, 256>(testURI, counter++);
435 testSet.insert(testURI);
438BOOST_AUTO_TEST_CASE(testStencilHeatGet_blockScaling)
440 std::string testURI = suiteURI +
".device.stencilGet.dense.N.2D.blockScaling";
441 unsigned int counter = 0;
443 launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
444 launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
445 launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
446 launch_testStencilHeatGet_perf<32, 256>(testURI, counter++);
448 testSet.insert(testURI);
452BOOST_AUTO_TEST_CASE(write_teport)
454 write_test_report(report_sparsegrid_funcs, testSet);
457BOOST_AUTO_TEST_SUITE_END()
459BOOST_AUTO_TEST_SUITE_END()
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
void stop()
Stop the timer.
void start()
Start the timer.
double getwct()
Return the elapsed real time.
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...