4#define BOOST_TEST_DYN_LINK 
    5#define DISABLE_MPI_WRITTERS 
    8#define SPARSEGRIDGPU_LAUNCH_BOUND_APPLY_STENCIL_IN_PLACE_NO_SHARED __launch_bounds__(BLOCK_SIZE_STENCIL,12) 
   10#include <boost/test/unit_test.hpp> 
   11#include "SparseGridGpu/SparseGridGpu.hpp" 
   12#include "cuda_macro.h" 
   13#include "util/stat/common_statistics.hpp" 
   14#include "Plot/GoogleChart.hpp" 
   15#include "util/performance/performance_util.hpp" 
   16#include "SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh" 
   18#include "performancePlots.hpp" 
   19#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh" 
   21extern char * test_dir;
 
   26std::string suiteURI = 
"performance.SparseGridGpu";
 
   27std::set<std::string> testSet;
 
   30BOOST_AUTO_TEST_SUITE(performance)
 
   32BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
 
   35template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize, 
typename SparseGr
idZ>
 
   36void testStencilHeatGet_perf(
unsigned int i, std::string base)
 
   38    auto testName = 
"In-place GET stencil";
 
   45    report_sparsegrid_funcs.graphs.put(base + 
".dim",2);
 
   46    report_sparsegrid_funcs.graphs.put(base + 
".blockSize",blockEdgeSize);
 
   47    report_sparsegrid_funcs.graphs.put(base + 
".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
 
   48    report_sparsegrid_funcs.graphs.put(base + 
".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
 
   50    unsigned int iterations = 100;
 
   55    dim3 gridSize(gridEdgeSize, gridEdgeSize);
 
   56    dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
 
   57    typename SparseGridZ::grid_info blockGeometry(gridSize);
 
   58    SparseGridZ sparseGrid(blockGeometry);
 
   60    sparseGrid.template setBackgroundValue<0>(0);
 
   62    unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
 
   65    sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
 
   66    CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
 
   67    sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
   69    sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
 
   70    dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
 
   71    insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
 
   72    sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
   74    sparseGrid.findNeighbours(); 
 
   77    for (
unsigned int iter=0; iter<iterations; ++iter)
 
   79        cudaDeviceSynchronize();
 
   84        sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
 
   85        cudaDeviceSynchronize();
 
   86        sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
 
   87        cudaDeviceSynchronize();
 
   91        measures_tm.add(ts.
getwct());
 
   93        float gElemS = 2 * numElements / (1e9 * ts.
getwct());
 
   94        float gFlopsS = gElemS * Stencil01T::flops;
 
   96        measures_gf.add(gFlopsS);
 
  100    double deviation_tm = 0;
 
  101    standard_deviation(measures_tm,mean_tm,deviation_tm);
 
  104    double deviation_gf = 0;
 
  105    standard_deviation(measures_gf,mean_gf,deviation_gf);
 
  109    float gElemS = 2 * numElements / (1e9 * mean_tm);
 
  110    float gFlopsS = gElemS * Stencil01T::flops;
 
  111    std::cout << 
"Test: " << testName << std::endl;
 
  112    std::cout << 
"Block: " << SparseGridZ::blockEdgeSize_ << 
"x" << SparseGridZ::blockEdgeSize_ << std::endl;
 
  113    std::cout << 
"Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << 
"x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
 
  114    double dataOccupancyMean, dataOccupancyDev;
 
  115    sparseGrid.deviceToHost();
 
  116    sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << 
"Data Occupancy: " << dataOccupancyMean << 
" dev:" << dataOccupancyDev << std::endl;
 
  117    report_sparsegrid_funcs.graphs.put(base + 
".dataOccupancy.mean",dataOccupancyMean);
 
  118    report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.dev",dataOccupancyDev);
 
  119    std::cout << 
"Iterations: " << iterations << std::endl;
 
  120    std::cout << 
"\tStencil: " << mean_gf << 
" dev:" << deviation_gf << 
" s" << std::endl;
 
  121    std::cout << 
"Throughput: " << std::endl << 
"\t " << gElemS << 
" GElem/s " << std::endl << 
"\t " << gFlopsS << 
" GFlops/s" << std::endl;
 
  123    report_sparsegrid_funcs.graphs.put(base + 
".GFlops.mean",mean_gf);
 
  124    report_sparsegrid_funcs.graphs.put(base +
".GFlops.dev",deviation_gf);
 
  125    report_sparsegrid_funcs.graphs.put(base + 
".time.mean",mean_tm);
 
  126    report_sparsegrid_funcs.graphs.put(base +
".time.dev",deviation_tm);
 
  128template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize>
 
  129void launch_testStencilHeatGet_perf(std::string testURI, 
unsigned int i)
 
  131    constexpr unsigned int dim = 2;
 
  136    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  137    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"StencilN");
 
  139    testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
 
  141    cudaDeviceSynchronize();
 
  143template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize>
 
  144void launch_testStencilHeatGetZ_perf(std::string testURI, 
unsigned int i)
 
  146    constexpr unsigned int dim = 2;
 
  151    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  152    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"StencilZ");
 
  154    testStencilHeatGet_perf<blockEdgeSize, gridEdgeSize,
 
  156    cudaDeviceSynchronize();
 
  159template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize, 
typename SparseGr
idZ>
 
  160void testStencilSkeleton_perf(
unsigned int i, std::string base)
 
  162    auto testName = 
"In-place stencil";
 
  169    report_sparsegrid_funcs.graphs.put(base + 
".dim",2);
 
  170    report_sparsegrid_funcs.graphs.put(base + 
".blockSize",blockEdgeSize);
 
  171    report_sparsegrid_funcs.graphs.put(base + 
".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
 
  172    report_sparsegrid_funcs.graphs.put(base + 
".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
 
  174    unsigned int iterations = 100;
 
  179    dim3 gridSize(gridEdgeSize, gridEdgeSize);
 
  180    dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
 
  181    typename SparseGridZ::grid_info blockGeometry(gridSize);
 
  182    SparseGridZ sparseGrid(blockGeometry);
 
  184    sparseGrid.template setBackgroundValue<0>(0);
 
  186    unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
 
  189    sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
 
  190    CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
 
  191    sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
  193    sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
 
  194    dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
 
  195    insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
 
  196    sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
  198    sparseGrid.findNeighbours(); 
 
  201    for (
unsigned int iter=0; iter<iterations; ++iter)
 
  203        cudaDeviceSynchronize();
 
  208        sparseGrid.template applyStencils<Stencil01T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
 
  209        cudaDeviceSynchronize();
 
  210        sparseGrid.template applyStencils<Stencil10T>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1);
 
  211        cudaDeviceSynchronize();
 
  215        measures_tm.add(ts.
getwct());
 
  217        float gElemS = 2 * numElements / (1e9 * ts.
getwct());
 
  218        float gFlopsS = gElemS * Stencil01T::flops;
 
  220        measures_gf.add(gFlopsS);
 
  224    double deviation_tm = 0;
 
  225    standard_deviation(measures_tm,mean_tm,deviation_tm);
 
  228    double deviation_gf = 0;
 
  229    standard_deviation(measures_gf,mean_gf,deviation_gf);
 
  233    float gElemS = 2 * numElements / (1e9 * mean_tm);
 
  234    float gFlopsS = gElemS * Stencil01T::flops;
 
  235    std::cout << 
"Test: " << testName << std::endl;
 
  236    std::cout << 
"Block: " << SparseGridZ::blockEdgeSize_ << 
"x" << SparseGridZ::blockEdgeSize_ << std::endl;
 
  237    std::cout << 
"Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << 
"x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
 
  238    double dataOccupancyMean, dataOccupancyDev;
 
  239    sparseGrid.deviceToHost();
 
  240    sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << 
"Data Occupancy: " << dataOccupancyMean << 
" dev:" << dataOccupancyDev << std::endl;
 
  241    report_sparsegrid_funcs.graphs.put(base + 
".dataOccupancy.mean",dataOccupancyMean);
 
  242    report_sparsegrid_funcs.graphs.put(base +
".dataOccupancy.dev",dataOccupancyDev);
 
  243    std::cout << 
"Iterations: " << iterations << std::endl;
 
  244    std::cout << 
"\tStencil: " << mean_gf << 
" dev:" << deviation_gf << 
" s" << std::endl;
 
  245    std::cout << 
"Throughput: " << std::endl << 
"\t " << gElemS << 
" GElem/s " << std::endl << 
"\t " << gFlopsS << 
" GFlops/s" << std::endl;
 
  247    report_sparsegrid_funcs.graphs.put(base + 
".GFlops.mean",mean_gf);
 
  248    report_sparsegrid_funcs.graphs.put(base +
".GFlops.dev",deviation_gf);
 
  249    report_sparsegrid_funcs.graphs.put(base + 
".time.mean",mean_tm);
 
  250    report_sparsegrid_funcs.graphs.put(base +
".time.dev",deviation_tm);
 
  253void launch_testConv3x3x3_perf_z_morton(std::string testURI, 
unsigned int i)
 
  255    constexpr unsigned int dim = 3;
 
  259    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  260    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"Conv3x3x3");
 
  262    testConv3x3x3_perf<SparseGridGpu_z<dim, AggregateT, 8, chunkSize,long int>>(
"Convolution 3x3x3 Z-morton");
 
  265void launch_testConv3x3x3_perf(std::string testURI, 
unsigned int i)
 
  267    constexpr unsigned int dim = 3;
 
  271    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  272    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"Conv3x3x3");
 
  274    testConv3x3x3_perf<SparseGridGpu<dim, AggregateT, 8, chunkSize,long int>>(
"Convolution 3x3x3 ");
 
  277void launch_testConv3x3x3_perf_no_shared_z_morton(std::string testURI, 
unsigned int i)
 
  279    constexpr unsigned int dim = 3;
 
  282    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  283    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"Conv3x3x3");
 
  285    testConv3x3x3_no_shared_perf<SparseGridGpu_z<dim, AggregateT, 8, 512, long int>>(
"Convolution 3x3x3_noshared z-morton");
 
  288void launch_testConv3x3x3_perf_no_shared(std::string testURI, 
unsigned int i)
 
  290    constexpr unsigned int dim = 3;
 
  293    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  294    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"Conv3x3x3");
 
  296    testConv3x3x3_no_shared_perf<SparseGridGpu<dim, AggregateT, 8, 512, long int>>(
"Convolution 3x3x3_noshared");
 
  299template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize>
 
  300void launch_testStencilSkeleton_perf(std::string testURI, 
unsigned int i)
 
  302    constexpr unsigned int dim = 2;
 
  307    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  308    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"StencilN");
 
  310    testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
 
  312    cudaDeviceSynchronize();
 
  316template<
unsigned int blockEdgeSize, 
unsigned int gr
idEdgeSize>
 
  317void launch_testStencilSkeletonZ_perf(std::string testURI, 
unsigned int i)
 
  319    constexpr unsigned int dim = 2;
 
  324    std::string base(testURI + 
"(" + std::to_string(i) + 
")");
 
  325    report_sparsegrid_funcs.graphs.put(base + 
".test.name",
"StencilZ");
 
  327    testStencilSkeleton_perf<blockEdgeSize, gridEdgeSize,
 
  329    cudaDeviceSynchronize();
 
  332BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared)
 
  334    std::string testURI = suiteURI + 
".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
 
  335    unsigned int counter = 0;
 
  336    launch_testConv3x3x3_perf_no_shared(testURI, counter++);
 
  337    testSet.insert(testURI);
 
  340BOOST_AUTO_TEST_CASE(testConv3x3x3_noshared_z_morton)
 
  342    std::string testURI = suiteURI + 
".device.conv3x3x3_no_shared.sparse.N.3D.gridScaling";
 
  343    unsigned int counter = 0;
 
  344    launch_testConv3x3x3_perf_no_shared_z_morton(testURI, counter++);
 
  345    testSet.insert(testURI);
 
  348BOOST_AUTO_TEST_CASE(testConv3x3x3)
 
  350    std::string testURI = suiteURI + 
".device.conv3x3x3.sparse.N.3D.gridScaling";
 
  351    unsigned int counter = 0;
 
  352    launch_testConv3x3x3_perf(testURI, counter++);
 
  353    testSet.insert(testURI);
 
  356BOOST_AUTO_TEST_CASE(testConv3x3x3_zmorton)
 
  359    std::string testURI = suiteURI + 
".device.conv3x3x3_zmorton.sparse.N.3D.gridScaling";
 
  360    unsigned int counter = 0;
 
  361    launch_testConv3x3x3_perf_z_morton(testURI, counter++);
 
  362    testSet.insert(testURI);
 
  365BOOST_AUTO_TEST_CASE(testStencilSkeleton_gridScaling)
 
  367    std::string testURI = suiteURI + 
".device.skeleton.dense.N.2D.gridScaling";
 
  368    unsigned int counter = 0;
 
  369    constexpr unsigned int blockEdgeSize = 8;
 
  370    launch_testStencilSkeleton_perf<blockEdgeSize, 128>(testURI, counter++);
 
  371    launch_testStencilSkeleton_perf<blockEdgeSize, 256>(testURI, counter++);
 
  372    launch_testStencilSkeleton_perf<blockEdgeSize, 512>(testURI, counter++);
 
  373    launch_testStencilSkeleton_perf<blockEdgeSize, 1024>(testURI, counter++);
 
  375    testSet.insert(testURI);
 
  378BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_2)
 
  380   std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.2.gridScaling";
 
  381   unsigned int counter = 0;
 
  382   launch_testStencilHeatGet_perf<2, 512>(testURI, counter++);
 
  383   launch_testStencilHeatGet_perf<2, 1024>(testURI, counter++);
 
  384   launch_testStencilHeatGet_perf<2, 2048>(testURI, counter++);
 
  387   testSet.insert(testURI);
 
  390BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_4)
 
  392   std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.4.gridScaling";
 
  393   unsigned int counter = 0;
 
  394   launch_testStencilHeatGet_perf<4, 256>(testURI, counter++);
 
  395   launch_testStencilHeatGet_perf<4, 512>(testURI, counter++);
 
  396   launch_testStencilHeatGet_perf<4, 1024>(testURI, counter++);
 
  397   launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
 
  399   testSet.insert(testURI);
 
  402BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_8)
 
  404   std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.8.gridScaling";
 
  405   unsigned int counter = 0;
 
  406   launch_testStencilHeatGet_perf<8, 128>(testURI, counter++);
 
  407   launch_testStencilHeatGet_perf<8, 256>(testURI, counter++);
 
  408   launch_testStencilHeatGet_perf<8, 512>(testURI, counter++);
 
  409   launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
 
  411   testSet.insert(testURI);
 
  414BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_16)
 
  416   std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.16.gridScaling";
 
  417   unsigned int counter = 0;
 
  418   launch_testStencilHeatGet_perf<16, 64>(testURI, counter++);
 
  419   launch_testStencilHeatGet_perf<16, 128>(testURI, counter++);
 
  420   launch_testStencilHeatGet_perf<16, 256>(testURI, counter++);
 
  421   launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
 
  423   testSet.insert(testURI);
 
  426BOOST_AUTO_TEST_CASE(testStencilHeatGet_gridScaling_32)
 
  428   std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.32.gridScaling";
 
  429   unsigned int counter = 0;
 
  430   launch_testStencilHeatGet_perf<32, 32>(testURI, counter++);
 
  431   launch_testStencilHeatGet_perf<32, 64>(testURI, counter++);
 
  432   launch_testStencilHeatGet_perf<32, 128>(testURI, counter++);
 
  433   launch_testStencilHeatGet_perf<32, 256>(testURI, counter++); 
 
  435   testSet.insert(testURI);
 
  438BOOST_AUTO_TEST_CASE(testStencilHeatGet_blockScaling)
 
  440    std::string testURI = suiteURI + 
".device.stencilGet.dense.N.2D.blockScaling";
 
  441    unsigned int counter = 0;
 
  443    launch_testStencilHeatGet_perf<4, 2048>(testURI, counter++);
 
  444    launch_testStencilHeatGet_perf<8, 1024>(testURI, counter++);
 
  445    launch_testStencilHeatGet_perf<16, 512>(testURI, counter++);
 
  446    launch_testStencilHeatGet_perf<32, 256>(testURI, counter++);
 
  448    testSet.insert(testURI);
 
  452BOOST_AUTO_TEST_CASE(write_teport)
 
  454    write_test_report(report_sparsegrid_funcs, testSet);
 
  457BOOST_AUTO_TEST_SUITE_END()
 
  459BOOST_AUTO_TEST_SUITE_END()
 
Implementation of 1-D std::vector like structure.
 
Class for cpu time benchmarking.
 
void stop()
Stop the timer.
 
void start()
Start the timer.
 
double getwct()
Return the elapsed real time.
 
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...