OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
SparseGridGpu_performance_insert_block.cu
1 /*
2  * SparseGridGpu_performance_insert_block.cu
3  *
4  * Created on: Sep 10, 2019
5  * Author: i-bird
6  */
7 #define BOOST_TEST_DYN_LINK
8 #define DISABLE_MPI_WRITTERS
9 
10 #include <boost/test/unit_test.hpp>
11 #include "performancePlots.hpp"
12 #include <iostream>
13 #include "SparseGridGpu/SparseGridGpu.hpp"
14 #include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15 
16 extern std::string suiteURI;
17 extern report_sparse_grid_tests report_sparsegrid_funcs;
18 extern std::set<std::string> testSet;
19 
20 template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
21 void test_insert_block(std::string testURI, unsigned int i)
22 {
23  auto testName = "Insert (one chunk per block)";
24  constexpr unsigned int dim = 2;
25 // constexpr unsigned int blockEdgeSize = 8;
26  constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
27  typedef aggregate<float> AggregateT;
28 
29 // std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").insert");
30  std::string base(testURI + "(" + std::to_string(i) + ")");
31  report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertBlock");
32 
33  report_sparsegrid_funcs.graphs.put(base + ".name","Block insert");
34  report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
35  report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
36  report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
37  report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);
38 
39  unsigned int iterations = 10;
40 
41  openfpm::vector<double> measures;
42 
43  unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
44  dim3 gridSize(gridEdgeSize, gridEdgeSize);
45  dim3 blockSize(blockEdgeSize, blockEdgeSize);
46  dim3 blockSizeBlockedInsert(1, 1);
47  grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
49  mgpu::ofp_context_t ctx;
50  sparseGrid.template setBackgroundValue<0>(0);
51 
52  // Warmup
53  for (unsigned int iter=0; iter<5; ++iter)
54  {
55  auto offset = 0;
56  sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
57  insertValues2DBlocked<0, 1, blockEdgeSize> << < gridSize, blockSize >> >
58  (sparseGrid.toKernel(), offset, offset);
59  sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
60  }
61 
62 
63  cudaDeviceSynchronize();
64 
65 
66  for (unsigned int iter=0; iter<iterations; ++iter)
67  {
68  auto offset = 0;
69 
70  cudaDeviceSynchronize();
71 
72  timer ts;
73  ts.start();
74 
75  sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
76  insertValues2DBlocked<0, 1, blockEdgeSize> << < gridSize, blockSize >> >
77  (sparseGrid.toKernel(), offset, offset);
78  sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
79 
80  cudaDeviceSynchronize();
81 
82  ts.stop();
83 
84  float mElemS = numElements / (1e6 * ts.getwct());
85  measures.add(mElemS);
86  }
87 
88  double mean = 0;
89  double deviation = 0;
90  standard_deviation(measures,mean,deviation);
91 
92  report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
93  report_sparsegrid_funcs.graphs.put(base +".Minsert.dev",deviation);
94 
95  // All times above are in ms
96 
97  std::cout << "Test: " << testName << "\n";
98  std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
99  std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";
100  double dataOccupancyMean, dataOccupancyDev;
101  sparseGrid.deviceToHost();
102  sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
103  report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
104  report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
105  std::cout << "Iterations: " << iterations << "\n";
106  std::cout << "\tInsert: " << mean << " dev: " << deviation << " s" << std::endl;
107  std::cout << "Throughput:\n\t" << mean << " MElem/s\n";
108 }
109 
110 
111 BOOST_AUTO_TEST_SUITE(performance)
112 
113 BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
114 
115 BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_2)
116 {
117  std::string testURI = suiteURI + ".device.insert.dense.block.2D.2.gridScaling";
118  unsigned int counter = 0;
119  test_insert_block<2,128>(testURI, counter++);
120  test_insert_block<2,256>(testURI, counter++);
121  test_insert_block<2,512>(testURI, counter++);
122  test_insert_block<2,1024>(testURI, counter++);
123  test_insert_block<2,2048>(testURI, counter++);
124 // test_insert_block<2,4096>(testURI, counter++);
125 
126  testSet.insert(testURI);
127 }
128 
129 BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_4)
130 {
131  std::string testURI = suiteURI + ".device.insert.dense.block.2D.4.gridScaling";
132  unsigned int counter = 0;
133  test_insert_block<4,64>(testURI, counter++);
134  test_insert_block<4,128>(testURI, counter++);
135  test_insert_block<4,256>(testURI, counter++);
136  test_insert_block<4,512>(testURI, counter++);
137  test_insert_block<4,1024>(testURI, counter++);
138  test_insert_block<4,2048>(testURI, counter++);
139 
140  testSet.insert(testURI);
141 }
142 
143 BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_8)
144 {
145  std::string testURI = suiteURI + ".device.insert.dense.block.2D.8.gridScaling";
146  unsigned int counter = 0;
147  test_insert_block<8,32>(testURI, counter++);
148  test_insert_block<8,64>(testURI, counter++);
149  test_insert_block<8,128>(testURI, counter++);
150  test_insert_block<8,256>(testURI, counter++);
151  test_insert_block<8,512>(testURI, counter++);
152  test_insert_block<8,1024>(testURI, counter++);
153 
154  testSet.insert(testURI);
155 }
156 
157 BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_16)
158 {
159  std::string testURI = suiteURI + ".device.insert.dense.block.2D.16.gridScaling";
160  unsigned int counter = 0;
161  test_insert_block<16,16>(testURI, counter++);
162  test_insert_block<16,32>(testURI, counter++);
163  test_insert_block<16,64>(testURI, counter++);
164  test_insert_block<16,128>(testURI, counter++);
165  test_insert_block<16,256>(testURI, counter++);
166  test_insert_block<16,512>(testURI, counter++);
167 
168  testSet.insert(testURI);
169 }
170 
171 BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_32)
172 {
173  std::string testURI = suiteURI + ".device.insert.dense.block.2D.32.gridScaling";
174  unsigned int counter = 0;
175  test_insert_block<32,8>(testURI, counter++);
176  test_insert_block<32,16>(testURI, counter++);
177  test_insert_block<32,32>(testURI, counter++);
178  test_insert_block<32,64>(testURI, counter++);
179  test_insert_block<32,128>(testURI, counter++);
180  test_insert_block<32,256>(testURI, counter++);
181 
182  testSet.insert(testURI);
183 }
184 
185 BOOST_AUTO_TEST_CASE(testInsertBlocked_blockScaling)
186 {
187  std::string testURI = suiteURI + ".device.insert.dense.block.2D.blockScaling";
188  unsigned int counter = 0;
189  test_insert_block<2,2048>(testURI, counter++);
190  test_insert_block<4,1024>(testURI, counter++);
191  test_insert_block<8,512>(testURI, counter++);
192  test_insert_block<16,256>(testURI, counter++);
193  test_insert_block<32,128>(testURI, counter++);
194 
195  testSet.insert(testURI);
196 }
197 
198 BOOST_AUTO_TEST_SUITE_END()
199 
200 BOOST_AUTO_TEST_SUITE_END()
double getwct()
Return the elapsed real time.
Definition: timer.hpp:130
void start()
Start the timer.
Definition: timer.hpp:90
Aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native structure...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for cpu time benchmarking.
Definition: timer.hpp:27
void stop()
Stop the timer.
Definition: timer.hpp:119