OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
SparseGridGpu_performance_insert_single.cu
1/*
2 * SparseGridGpu_performance_insert_single.cu
3 *
4 * Created on: Sep 10, 2019
5 * Author: i-bird
6 */
7#define BOOST_TEST_DYN_LINK
8#define DISABLE_MPI_WRITTERS
9
10#include <boost/test/unit_test.hpp>
11#include "performancePlots.hpp"
12#include <iostream>
13#include "SparseGridGpu/SparseGridGpu.hpp"
14#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
15
16extern std::string suiteURI;
17extern report_sparse_grid_tests report_sparsegrid_funcs;
18extern std::set<std::string> testSet;
19
20
/**
 * Benchmark: repeatedly insert values into a 2D sparse grid on the GPU and
 * report the insertion throughput.
 *
 * Each insertion pass performs the same four steps: setGPUInsertBuffer(),
 * launch insertValues2D<0>, flush<smax_<0>> with FLUSH_ON_DEVICE, and
 * cudaDeviceSynchronize(). The function runs one optional pre-population pass,
 * 5 untimed passes, and then `iterations` timed passes.
 *
 * Results (test name, dim, block size, grid size, throughput mean/deviation,
 * data occupancy mean/deviation) are stored in report_sparsegrid_funcs.graphs
 * under the key prefix testURI + "(" + i + ")" and a summary is printed to
 * std::cout.
 *
 * \tparam blockEdgeSize edge of the launch block; used for both components of blockSize
 * \tparam gridEdgeSize edge of the launch grid; used for both components of gridSize
 *
 * \param testURI prefix of the report keys written by this run
 * \param i index of this run, appended to testURI in parentheses
 *
 * NOTE(review): `sparseGrid`, `ctx` and `measures` are used below but are not
 * declared anywhere in this listing. The embedded line numbers jump 44->47,
 * 55->57 and 68->70, so their declarations appear to have been dropped when
 * the listing was rendered - confirm against the original file.
 *
 * NOTE(review): the kernel launches are not followed by an error query and the
 * return value of cudaDeviceSynchronize() is ignored.
 */
21template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
22void testInsertSingle(std::string testURI, unsigned int i)
23{
24 auto testName = "Insert single (one chunk per element)";
25 constexpr unsigned int dim = 2;
26// constexpr unsigned int blockEdgeSize = 8;
 // chunkSize and AggregateT are not referenced in the visible lines;
 // presumably they parameterise the missing sparseGrid declaration - TODO confirm.
27 constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
28 typedef aggregate<float> AggregateT;
29
 // Number of timed passes, and whether to run one extra pass before the untimed loop.
30 unsigned int iterations = 10;
31 bool prePopulateGrid = true;
32
33// std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").insertSingle");
 // Key prefix for every entry this run writes into the report tree.
34 std::string base(testURI + "(" + std::to_string(i) + ")");
35 report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertSingle");
36
37 report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
38 report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
 // Reported grid size per axis is gridEdgeSize*blockEdgeSize.
39 report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
40 report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);
41
 // Launch configuration shared by every insertion pass below.
42 dim3 gridSize(gridEdgeSize, gridEdgeSize);
43 dim3 blockSize(blockEdgeSize, blockEdgeSize);
44 grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 // NOTE(review): embedded lines 45-46 are missing here; sparseGrid and ctx are
 // first used below without a visible declaration.
47 sparseGrid.template setBackgroundValue<0>(0);
48
49 if (prePopulateGrid)
50 {
51 // Pre-populate grid
52 sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
53 insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0, 0);
54 sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
55 cudaDeviceSynchronize();
57 }
58
 // Untimed passes: same sequence as the timed loop, results are not recorded.
59 for (unsigned int iter=0; iter<5; ++iter)
60 {
61 auto offset = 0;
62 sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
63 insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
64 sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
65 cudaDeviceSynchronize();
66 }
67
 // Element count used as the numerator of the throughput.
 // NOTE(review): all four factors are unsigned int, so the product is evaluated
 // in unsigned int before being widened to unsigned long long.
68 unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
70
 // Timed passes: one throughput sample per iteration is added to `measures`
 // (declaration not visible in this listing, embedded line 69 is missing).
71 for (unsigned int iter=0; iter<iterations; ++iter)
72 {
73 auto offset = 0;
74
 // Synchronize before starting the timer so the timed region begins after
 // previously issued device work has completed.
75 cudaDeviceSynchronize();
76
77 timer ts;
78 ts.start();
79
80 sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
81 insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
82 sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 // Synchronize before stopping the timer so the device work is inside the timed region.
83 cudaDeviceSynchronize();
84
85 ts.stop();
86
 // Sample = numElements / (1e6 * elapsed wall-clock time), stored as float.
87 float mElemS = numElements / (1e6 * ts.getwct());
88 measures.add(mElemS);
89 }
90
 // standard_deviation() receives the samples and fills mean and deviation.
91 double mean = 0;
92 double deviation = 0;
93 standard_deviation(measures,mean,deviation);
94
95 report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
96 report_sparsegrid_funcs.graphs.put(base +".Minsert.dev",deviation);
97
98 // NOTE(review): the samples above are throughputs (numElements / (1e6 * elapsed time), printed below as "M/s"), not times in ms
99
100 std::cout << "Test: " << testName << "\n";
101 std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
102 std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";
103 double dataOccupancyMean, dataOccupancyDev;
 // deviceToHost() is called before the occupancy measurement; presumably the
 // measurement reads host-side data - TODO confirm.
104 sparseGrid.deviceToHost();
105 sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
106 report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
107 report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
108 std::cout << "Iterations: " << iterations << "\n";
109 std::cout << "Throughput:\n\t" << mean << "M/s" << "\n";
110}
111
112
113BOOST_AUTO_TEST_SUITE(performance)
114
115BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
116
// Grid scaling with blockEdgeSize = 2: runs testInsertSingle for
// gridEdgeSize = 128, 256, 512, 1024, numbering the runs 0..3 under the same
// testURI, then registers testURI in testSet.
117BOOST_AUTO_TEST_CASE(testInsert_gridScaling_2)
118{
119 std::string testURI = suiteURI + ".device.insert.dense.single.2D.2.gridScaling";
120 unsigned int counter = 0;
121 testInsertSingle<2, 128>(testURI, counter++);
122 testInsertSingle<2, 256>(testURI, counter++);
123 testInsertSingle<2, 512>(testURI, counter++);
124 testInsertSingle<2, 1024>(testURI, counter++);
125 testSet.insert(testURI);
126}
// Grid scaling with blockEdgeSize = 4: runs testInsertSingle for
// gridEdgeSize = 64, 128, 256, 512, numbering the runs 0..3 under the same
// testURI, then registers testURI in testSet.
127BOOST_AUTO_TEST_CASE(testInsert_gridScaling_4)
128{
129 std::string testURI = suiteURI + ".device.insert.dense.single.2D.4.gridScaling";
130 unsigned int counter = 0;
131 testInsertSingle<4, 64>(testURI, counter++);
132 testInsertSingle<4, 128>(testURI, counter++);
133 testInsertSingle<4, 256>(testURI, counter++);
134 testInsertSingle<4, 512>(testURI, counter++);
135 testSet.insert(testURI);
136}
// Grid scaling with blockEdgeSize = 8: runs testInsertSingle for
// gridEdgeSize = 32, 64, 128, 256, numbering the runs 0..3 under the same
// testURI, then registers testURI in testSet.
137BOOST_AUTO_TEST_CASE(testInsert_gridScaling_8)
138{
139 std::string testURI = suiteURI + ".device.insert.dense.single.2D.8.gridScaling";
140 unsigned int counter = 0;
141 testInsertSingle<8, 32>(testURI, counter++);
142 testInsertSingle<8, 64>(testURI, counter++);
143 testInsertSingle<8, 128>(testURI, counter++);
144 testInsertSingle<8, 256>(testURI, counter++);
145 testSet.insert(testURI);
146}
147
// Block scaling: runs testInsertSingle with (blockEdgeSize, gridEdgeSize) =
// (2, 1024), (4, 512), (8, 256). In all three runs gridEdgeSize*blockEdgeSize
// is 2048, so only the block edge changes. Runs are numbered 0..2 under the
// same testURI, which is then registered in testSet.
148BOOST_AUTO_TEST_CASE(testInsert_blockScaling)
149{
150 std::string testURI = suiteURI + ".device.insert.dense.single.2D.blockScaling";
151 unsigned int counter = 0;
152 testInsertSingle<2, 1024>(testURI, counter++);
153 testInsertSingle<4, 512>(testURI, counter++);
154 testInsertSingle<8, 256>(testURI, counter++);
155
156 testSet.insert(testURI);
157}
158
159
160
161BOOST_AUTO_TEST_SUITE_END()
162
163BOOST_AUTO_TEST_SUITE_END()
164
165
166
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
Definition timer.hpp:28
void stop()
Stop the timer.
Definition timer.hpp:119
void start()
Start the timer.
Definition timer.hpp:90
double getwct()
Return the elapsed real time.
Definition timer.hpp:130
Aggregate of properties: from a list of objects it creates a struct that follows the OPENFPM native stru...