OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
compute_optimal_device_grid.hpp
1/*
2 * compute_optimal_device_grid.hpp
3 *
4 * Created on: Oct 1, 2017
5 * Author: i-bird
6 */
7
8#ifndef OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_
9#define OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_
10
11
36template<unsigned int dim>
37void calculate_optimal_device_grid(device_grid<dim> & dg,
38 size_t (& sz)[dim],
39 size_t max_block_size,
40 size_t min_block_size)
41{
42
43 if (dim == 0) {return ;}
44
45 size_t tot_block = 1;
46
47 // Here we calculate the factors for each grid dimension and prioritize the
48 // factors by 2 for the blocks
49
50 // Get the factors for x
51 std::vector<unsigned long int> x;
52 openfpm::math::getFactorization(sz[0],x);
53
54 dg.threads.x = 1;
55 size_t jx = 0;
56
57 while(jx < x.size() && x[jx] == 2 && tot_block < max_block_size)
58 {
59 if (tot_block * 2 > max_block_size)
60 {break;}
61
62 dg.threads.x *= 2;
63 tot_block *= 2;
64
65 jx++;
66 }
67
68 // if we already reach the maximum block size we finished
69 if (tot_block * 2 > max_block_size)
70 {
71 dg.threads.y = 1;
72 dg.threads.z = 1;
73
74 dg.grids.x = 1;
75 for (; jx < x.size() ; jx++)
76 {dg.grids.x *= x[jx];}
77
78 if (dim >= 2)
79 {dg.grids.y = sz[1];}
80 else
81 {dg.grids.y = 1;}
82
83 dg.grids.z = 1;
84 for (size_t k = 2 ; k < dim ; k++)
85 // coverty[dead_error_line]
86 {dg.grids.z *= sz[k];}
87
88 return;
89 }
90
91
92 // Get the factors for y
93 std::vector<unsigned long int> y;
94 size_t jy = 0;
95 dg.threads.y = 1;
96
97 if (dim >= 2)
98 {
99 openfpm::math::getFactorization(sz[1],y);
100
101 while(jy < y.size() && y[jy] == 2 && tot_block < max_block_size)
102 {
103 if (tot_block * 2 > max_block_size)
104 {break;}
105
106 dg.threads.y *= 2;
107 tot_block *= 2;
108
109 jy++;
110 }
111
112 // if we already reach the maximum block size we finished
113 if (tot_block * 2 > max_block_size)
114 {
115 dg.threads.z = 1;
116
117 dg.grids.x = 1;
118 for (; jx < x.size() ; jx++)
119 {dg.grids.x *= x[jx];}
120
121 dg.grids.y = 1;
122 for (; jy < y.size() ; jy++)
123 {dg.grids.y *= y[jy];}
124
125 dg.grids.z = 1;
126 for (size_t k = 2 ; k < dim ; k++)
127 {dg.grids.z *= sz[k];}
128
129 return;
130 }
131 }
132
133 // Get the factors for z
134 std::vector<unsigned long int> z;
135
136 size_t jz = 0;
137 dg.threads.z = 1;
138
139 if (dim >= 3)
140 {
141 openfpm::math::getFactorization(sz[2],z);
142
143 while(jz < z.size() && z[jz] == 2 && tot_block < max_block_size)
144 {
145 if (tot_block * 2 > max_block_size)
146 {break;}
147
148 dg.threads.z *= 2;
149 tot_block *= 2;
150
151 jz++;
152 }
153
154 // if we already reach the maximum block size we finished
155 if (tot_block * 2 > max_block_size)
156 {
157 dg.grids.x = 1;
158 for (; jx < x.size() ; jx++)
159 {dg.grids.x *= x[jx];}
160
161 dg.grids.y = 1;
162 for (; jy < y.size() ; jy++)
163 {dg.grids.y *= y[jy];}
164
165 dg.grids.z = 1;
166 for (; jz < z.size() ; jz++)
167 {dg.grids.z *= z[jz];}
168
169 for (size_t k = 3 ; k < dim ; k++)
170 // coverty[dead_error_line]
171 {dg.grids.z *= sz[k];}
172
173 return;
174 }
175 }
176
177 if (tot_block >= min_block_size)
178 {return;}
179
180 // Calculate the grids from the threads configuration
181
182 dg.grids.x = 1;
183 for (size_t k = jx ; k < x.size() ; k++)
184 {dg.grids.x *= x[k];}
185
186 dg.grids.y = 1;
187 for (size_t k = jy ; k < y.size() ; k++)
188 {dg.grids.y *= y[k];}
189
190 dg.grids.z = 1;
191 for (size_t k = jz ; k < z.size() ; k++)
192 {dg.grids.z *= z[k];}
193
194 std::vector<unsigned long int> * ptr_xyz[3];
195 ptr_xyz[0] = &x;
196 ptr_xyz[1] = &y;
197 ptr_xyz[2] = &z;
198
199 size_t * jj[3];
200 jj[0] = &jx;
201 jj[1] = &jy;
202 jj[2] = &jz;
203
204 while (tot_block < min_block_size && (jx < x.size() || jy < y.size() || jz < z.size() ) )
205 {
206 size_t best_fact = std::numeric_limits<size_t>::max();
207 size_t k_best = 0;
208
209 for (size_t k = 0 ; k < dim ; k++)
210 {
211 if (*jj[k] < ptr_xyz[k]->size() && ptr_xyz[k]->operator[](*jj[k]) < best_fact )
212 {
213 best_fact = ptr_xyz[k]->operator[](*jj[k]);
214 k_best = k;
215 }
216 }
217
218 // The maximum block size cannot be passed
219 if (tot_block * best_fact > max_block_size)
220 {break;}
221
222 if (k_best == 0)
223 {
224 dg.threads.x *= best_fact;
225 dg.grids.x /= best_fact;
226 }
227 else if (k_best == 1)
228 {
229 dg.threads.y *= best_fact;
230 dg.grids.y /= best_fact;
231 }
232 /* coverty[dead_error_line] */
233 else if (k_best == 2)
234 {
235 dg.threads.z *= best_fact;
236 dg.grids.z /= best_fact;
237 }
238
239 tot_block *= best_fact;
240
241 (*jj[k_best])++;
242 }
243}
244
245
246#endif /* OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ */
dim3_ grids
number of grid for the kernel execution
Definition map_grid.hpp:481
dim3_ threads
number of treads in each block
Definition map_grid.hpp:478
unsigned int z
size in z dimension
Definition map_grid.hpp:471
unsigned int x
size in x dimension
Definition map_grid.hpp:465
unsigned int y
size in y dimension
Definition map_grid.hpp:468