OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
SparseGridGpu_performance_insert_block.cu
/*
 * SparseGridGpu_performance_insert_block.cu
 *
 *  Created on: Sep 10, 2019
 *      Author: i-bird
 */
#define BOOST_TEST_DYN_LINK
#define DISABLE_MPI_WRITTERS

#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"

extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;

template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void test_insert_block(std::string testURI, unsigned int i)
{
    auto testName = "Insert (one chunk per block)";
    constexpr unsigned int dim = 2;
//    constexpr unsigned int blockEdgeSize = 8;
    constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
    typedef aggregate<float> AggregateT;

//    std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").insert");
    std::string base(testURI + "(" + std::to_string(i) + ")");
    report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertBlock");

    report_sparsegrid_funcs.graphs.put(base + ".name","Block insert");
    report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
    report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
    report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
    report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

    unsigned int iterations = 10;

    openfpm::vector<double> measures;

    unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
    dim3 gridSize(gridEdgeSize, gridEdgeSize);
    dim3 blockSize(blockEdgeSize, blockEdgeSize);
    dim3 blockSizeBlockedInsert(1, 1);
    grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
    SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
    mgpu::ofp_context_t ctx;
    sparseGrid.template setBackgroundValue<0>(0);

    // Warmup
    for (unsigned int iter=0; iter<5; ++iter)
    {
        auto offset = 0;
        sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
        insertValues2DBlocked<0, 1, blockEdgeSize> <<< gridSize, blockSize >>>
                (sparseGrid.toKernel(), offset, offset);
        sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
    }


    cudaDeviceSynchronize();


    for (unsigned int iter=0; iter<iterations; ++iter)
    {
        auto offset = 0;

        cudaDeviceSynchronize();

        timer ts;
        ts.start();

        sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
        insertValues2DBlocked<0, 1, blockEdgeSize> <<< gridSize, blockSize >>>
                (sparseGrid.toKernel(), offset, offset);
        sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);

        cudaDeviceSynchronize();

        ts.stop();

        float mElemS = numElements / (1e6 * ts.getwct());
        measures.add(mElemS);
    }

    double mean = 0;
    double deviation = 0;
    standard_deviation(measures,mean,deviation);

    report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
    report_sparsegrid_funcs.graphs.put(base + ".Minsert.dev",deviation);

    // Note: measures holds per-iteration throughput in MElem/s; ts.getwct() returns seconds

    std::cout << "Test: " << testName << "\n";
    std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
    std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";
    double dataOccupancyMean, dataOccupancyDev;
    sparseGrid.deviceToHost();
    sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
    std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
    report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
    report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.dev",dataOccupancyDev);
    std::cout << "Iterations: " << iterations << "\n";
    std::cout << "\tInsert: " << mean << " dev: " << deviation << " MElem/s" << std::endl;
    std::cout << "Throughput:\n\t" << mean << " MElem/s\n";
}


BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_2)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.2.gridScaling";
    unsigned int counter = 0;
    test_insert_block<2,128>(testURI, counter++);
    test_insert_block<2,256>(testURI, counter++);
    test_insert_block<2,512>(testURI, counter++);
    test_insert_block<2,1024>(testURI, counter++);
    test_insert_block<2,2048>(testURI, counter++);
//    test_insert_block<2,4096>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_4)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.4.gridScaling";
    unsigned int counter = 0;
    test_insert_block<4,64>(testURI, counter++);
    test_insert_block<4,128>(testURI, counter++);
    test_insert_block<4,256>(testURI, counter++);
    test_insert_block<4,512>(testURI, counter++);
    test_insert_block<4,1024>(testURI, counter++);
    test_insert_block<4,2048>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_8)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.8.gridScaling";
    unsigned int counter = 0;
    test_insert_block<8,32>(testURI, counter++);
    test_insert_block<8,64>(testURI, counter++);
    test_insert_block<8,128>(testURI, counter++);
    test_insert_block<8,256>(testURI, counter++);
    test_insert_block<8,512>(testURI, counter++);
    test_insert_block<8,1024>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_16)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.16.gridScaling";
    unsigned int counter = 0;
    test_insert_block<16,16>(testURI, counter++);
    test_insert_block<16,32>(testURI, counter++);
    test_insert_block<16,64>(testURI, counter++);
    test_insert_block<16,128>(testURI, counter++);
    test_insert_block<16,256>(testURI, counter++);
    test_insert_block<16,512>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_32)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.32.gridScaling";
    unsigned int counter = 0;
    test_insert_block<32,8>(testURI, counter++);
    test_insert_block<32,16>(testURI, counter++);
    test_insert_block<32,32>(testURI, counter++);
    test_insert_block<32,64>(testURI, counter++);
    test_insert_block<32,128>(testURI, counter++);
    test_insert_block<32,256>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_CASE(testInsertBlocked_blockScaling)
{
    std::string testURI = suiteURI + ".device.insert.dense.block.2D.blockScaling";
    unsigned int counter = 0;
    test_insert_block<2,2048>(testURI, counter++);
    test_insert_block<4,1024>(testURI, counter++);
    test_insert_block<8,512>(testURI, counter++);
    test_insert_block<16,256>(testURI, counter++);
    test_insert_block<32,128>(testURI, counter++);

    testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
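A note on the launch geometry measured above: each iteration launches insertValues2DBlocked (defined in SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh and not reproduced in this listing) over gridEdgeSize x gridEdgeSize CUDA blocks of blockEdgeSize x blockEdgeSize threads, while setGPUInsertBuffer receives a (1,1) block size; the test name "Insert (one chunk per block)" suggests that each CUDA block fills exactly one sparse-grid chunk. The sketch below illustrates only that assumed index mapping with a hypothetical kernel writing to a dense buffer; the kernel name, the buffer, and the payload are placeholders, not the OpenFPM implementation.

#include <cuda_runtime.h>

// Hypothetical sketch of the "one chunk per block" mapping (NOT the OpenFPM kernel):
// each CUDA block covers one blockEdgeSize x blockEdgeSize chunk of the 2-D domain,
// and each thread writes the single element it owns. The target here is a dense
// buffer; the real benchmark inserts into the SparseGridGpu instead.
template<unsigned int blockEdgeSize>
__global__ void insert_one_chunk_per_block(float * out, unsigned int rowPitch,
                                           int offsetX, int offsetY)
{
    // Global 2-D coordinate of the element handled by this thread.
    const unsigned int x = offsetX + blockIdx.x * blockEdgeSize + threadIdx.x;
    const unsigned int y = offsetY + blockIdx.y * blockEdgeSize + threadIdx.y;

    // All threads of a block stay inside chunk (blockIdx.x, blockIdx.y),
    // so exactly one chunk is touched per CUDA block.
    out[y * rowPitch + x] = 1.0f;   // placeholder payload
}

// Example launch mirroring test_insert_block<8,512>: a 512 x 512 grid of 8 x 8 thread
// blocks covers a 4096 x 4096 domain (about 16.8 million elements per insert pass).
// insert_one_chunk_per_block<8><<<dim3(512, 512), dim3(8, 8)>>>(d_out, 4096, 0, 0);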
Referenced symbols:
openfpm::vector: Implementation of a 1-D std::vector-like structure.
timer: Class for CPU time benchmarking. Definition: timer.hpp:28
timer::start(): Start the timer. Definition: timer.hpp:90
timer::stop(): Stop the timer. Definition: timer.hpp:119
timer::getwct(): Return the elapsed real time. Definition: timer.hpp:130
aggregate: Aggregate of properties; from a list of objects it creates a struct that follows the OpenFPM native structure.
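The timer members referenced in the listing and documented above (start(), stop(), getwct()) are the only ones the benchmark uses. A minimal usage sketch of that pattern, assuming timer.hpp is available on the include path:

#include "timer.hpp"   // timer class documented above (timer.hpp:28)
#include <iostream>

// Minimal sketch of the timing pattern used in the benchmark loop:
// bracket the measured region with start()/stop(), then read the elapsed
// wall-clock time in seconds via getwct().
void time_region_example()
{
    timer ts;
    ts.start();          // begin wall-clock measurement

    // ... region to measure (kernel launch + flush in the benchmark above) ...

    ts.stop();           // end measurement
    std::cout << "elapsed: " << ts.getwct() << " s\n";
}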