OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
SparseGridGpu_performance_insert_single.cu
1 /*
2  * SparseGridGpu_performance_insert_single.cu
3  *
4  * Created on: Sep 10, 2019
5  * Author: i-bird
6  */
7 #define BOOST_TEST_DYN_LINK
8 #define DISABLE_MPI_WRITTERS
9 
10 #include <boost/test/unit_test.hpp>
11 #include "performancePlots.hpp"
12 #include <iostream>
13 #include "SparseGridGpu/SparseGridGpu.hpp"
14 #include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15 
16 extern std::string suiteURI;
17 extern report_sparse_grid_tests report_sparsegrid_funcs;
18 extern std::set<std::string> testSet;
19 
20 
/*! \brief Benchmark single-element insertion (one chunk per element) on SparseGridGpu.
 *
 * Builds a dense 2D grid of gridEdgeSize x gridEdgeSize blocks, optionally
 * pre-populates it, runs warm-up passes, then times `iterations` rounds of
 * insert + flush and records throughput (millions of elements/second, mean and
 * standard deviation) plus block-data occupancy into the performance report tree.
 *
 * \tparam blockEdgeSize edge length of a block (a chunk holds blockEdgeSize^dim elements)
 * \tparam gridEdgeSize  number of blocks per grid edge
 * \param testURI base path in the report tree; results go under "<testURI>(<i>)"
 * \param i       index of this run inside the test series
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void testInsertSingle(std::string testURI, unsigned int i)
{
	auto testName = "Insert single (one chunk per element)";
	constexpr unsigned int dim = 2;
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
	typedef aggregate<float> AggregateT;

	unsigned int iterations = 10;
	bool prePopulateGrid = true;

	// Report node for this run, e.g. "<testURI>(3)"
	std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertSingle");

	report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	dim3 blockSize(blockEdgeSize, blockEdgeSize);
	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
	// FIX: this declaration was missing — sparseGrid is used throughout the
	// function (setGPUInsertBuffer, toKernel, flush, deviceToHost, ...) but
	// was never declared, so the function could not compile.
	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	if (prePopulateGrid)
	{
		// Pre-populate the grid so the timed inserts hit already-existing chunks
		sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
		insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0, 0);
		sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
		cudaDeviceSynchronize();
	}

	// Warm-up passes (not timed) to exclude first-touch allocation/setup costs
	for (unsigned int iter=0; iter<5; ++iter)
	{
		auto offset = 0;
		sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
		insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
		sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
		cudaDeviceSynchronize();
	}

	unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
	openfpm::vector<double> measures;

	for (unsigned int iter=0; iter<iterations; ++iter)
	{
		auto offset = 0;

		// Make sure no prior GPU work leaks into the timed region
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
		insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
		sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
		cudaDeviceSynchronize();

		ts.stop();

		// Throughput in millions of inserted elements per second.
		// Keep full double precision: the original narrowed through a float
		// before storing into a vector<double>.
		double mElemS = numElements / (1e6 * ts.getwct());
		measures.add(mElemS);
	}

	double mean = 0;
	double deviation = 0;
	standard_deviation(measures,mean,deviation);

	report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
	report_sparsegrid_funcs.graphs.put(base +".Minsert.dev",deviation);

	// Human-readable summary of the run
	std::cout << "Test: " << testName << "\n";
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
	std::cout << "Iterations: " << iterations << "\n";
	std::cout << "Throughput:\n\t" << mean << "M/s" << "\n";
}
111 
112 
113 BOOST_AUTO_TEST_SUITE(performance)
114 
115 BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
116 
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_2)
{
	// Grid-scaling study: block edge fixed at 2, grid edge doubling 128 -> 1024
	std::string testURI = suiteURI + ".device.insert.dense.single.2D.2.gridScaling";
	testInsertSingle<2, 128>(testURI, 0);
	testInsertSingle<2, 256>(testURI, 1);
	testInsertSingle<2, 512>(testURI, 2);
	testInsertSingle<2, 1024>(testURI, 3);
	testSet.insert(testURI);
}
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_4)
{
	// Grid-scaling study: block edge fixed at 4, grid edge doubling 64 -> 512
	std::string testURI = suiteURI + ".device.insert.dense.single.2D.4.gridScaling";
	testInsertSingle<4, 64>(testURI, 0);
	testInsertSingle<4, 128>(testURI, 1);
	testInsertSingle<4, 256>(testURI, 2);
	testInsertSingle<4, 512>(testURI, 3);
	testSet.insert(testURI);
}
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_8)
{
	// Grid-scaling study: block edge fixed at 8, grid edge doubling 32 -> 256
	std::string testURI = suiteURI + ".device.insert.dense.single.2D.8.gridScaling";
	testInsertSingle<8, 32>(testURI, 0);
	testInsertSingle<8, 64>(testURI, 1);
	testInsertSingle<8, 128>(testURI, 2);
	testInsertSingle<8, 256>(testURI, 3);
	testSet.insert(testURI);
}
147 
BOOST_AUTO_TEST_CASE(testInsert_blockScaling)
{
	// Block-scaling study: total domain edge held at 2048 elements while the
	// block edge grows 2 -> 8 (grid edge shrinks accordingly)
	std::string testURI = suiteURI + ".device.insert.dense.single.2D.blockScaling";
	testInsertSingle<2, 1024>(testURI, 0);
	testInsertSingle<4, 512>(testURI, 1);
	testInsertSingle<8, 256>(testURI, 2);
	testSet.insert(testURI);
}
158 
159 
160 
161 BOOST_AUTO_TEST_SUITE_END()
162 
163 BOOST_AUTO_TEST_SUITE_END()
164 
165 
166 
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119