OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
main.cu
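This example measures the device memory bandwidth achieved when reading and writing the properties of a 3D OpenFPM grid (a scalar, a 2-component vector and a 2x2 tensor) in three ways: with regular __global__ kernels that use the grid's templated get<> accessors (the "TLS" variant in the output), with equivalent __device__ lambdas launched through CUDA_LAUNCH_LAMBDA, and with raw strided-array accesses through getDeviceBuffer<>(). Each variant runs 110 times; the first 10 runs are discarded and the mean bandwidth and standard deviation over the remaining 100 are reported. When built with the plain CUDA backend (CUDIFY_USE_CUDA), a device-to-device cudaMemcpy serves as a bandwidth reference.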
#ifdef __NVCC__

#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"

#define NELEMENTS 67108864

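// NELEMENTS = 512*512*256, the number of grid points. Every benchmark kernel
// below moves 9 floats per point (e.g. one read plus eight writes), which is
// where the nele*4*9 bytes in the bandwidth figures come from.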
// Write benchmark kernel: read one float from the input grid and write it to
// the output grid's scalar, vector and tensor properties (plus the second
// component of the input vector).
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_in.template get<0>(p)[0];

    vd_out.template get<0>(p) = a;

    vd_out.template get<1>(p)[0] = a;
    vd_out.template get<1>(p)[1] = a;

    vd_out.template get<2>(p)[0][0] = a;
    vd_out.template get<2>(p)[0][1] = a;
    vd_out.template get<2>(p)[1][0] = a;
    vd_out.template get<2>(p)[1][1] = a;
    vd_in.template get<0>(p)[1] = a;
}

// Read benchmark kernel: read all eight floats written above and store their
// sum into the second component of the input vector.
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_out.template get<0>(p);

    float b = vd_out.template get<1>(p)[0];
    float c = vd_out.template get<1>(p)[1];

    float d = vd_out.template get<2>(p)[0][0];
    float e = vd_out.template get<2>(p)[0][1];
    float f = vd_out.template get<2>(p)[1][0];
    float g = vd_out.template get<2>(p)[1][1];

    float h = vd_in.template get<0>(p)[0];
    vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
}

// Verify on the host that the write kernel produced the expected values.
template<typename in_type, typename out_type>
void check_write(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    auto it = in.getIterator();
    while (it.isNext())
    {
        auto i = it.get();

        float a = in.template get<0>(i)[0];

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            success &= a != 0;
        }

        success &= out.template get<0>(i) == a;

        success &= out.template get<1>(i)[0] == a;
        success &= out.template get<1>(i)[1] == a;

        success &= out.template get<2>(i)[0][0] == a;
        success &= out.template get<2>(i)[0][1] == a;
        success &= out.template get<2>(i)[1][0] == a;
        success &= out.template get<2>(i)[1][1] == a;

        success &= in.template get<0>(i)[1] == a;

        if (success == false)
        {
            std::cout << "FAIL " << a << " " << i.to_string() << " " << out.template get<0>(i) << " " << out.template get<1>(i)[0]
                      << " " << out.template get<1>(i)[1] << " " << out.template get<2>(i)[0][0]
                      << " " << out.template get<2>(i)[0][1] << " " << out.template get<2>(i)[1][0]
                      << " " << out.template get<2>(i)[1][1]
                      << std::endl;
            break;
        }

        ++it;
    }

    if (success == false)
    {
        std::cout << "FAIL WRITE" << std::endl;
        exit(1);
    }
}

// Verify on the host that the read kernel stored the correct sums.
template<typename in_type, typename out_type>
void check_read(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    auto it = in.getIterator();
    while (it.isNext())
    {
        auto i = it.get();

        float a = out.template get<0>(i);

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            success &= a != 0;
        }

        float b = out.template get<1>(i)[0];
        float c = out.template get<1>(i)[1];

        float d = out.template get<2>(i)[0][0];
        float e = out.template get<2>(i)[0][1];
        float f = out.template get<2>(i)[1][0];
        float g = out.template get<2>(i)[1][1];

        float h = in.template get<0>(i)[0];

        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);

        if (success == false)
        {
            std::cout << "FAIL READ " << i.to_string() << " " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
            exit(1);
        }

        ++it;
    }
}

// Fill both grids with deterministic values so the benchmarks read real data.
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> i({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    vd_in.template get<0>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_in.template get<0>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0;

    vd_out.template get<0>(i) = i.get(0) + i.get(1) + i.get(2)+200.0;

    vd_out.template get<1>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<1>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0;

    vd_out.template get<2>(i)[0][0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<2>(i)[0][1] = i.get(0) + i.get(1) + i.get(2)+100.0;
    vd_out.template get<2>(i)[1][0] = i.get(0) + i.get(1) + i.get(2)+200.0;
    vd_out.template get<2>(i)[1][1] = i.get(0) + i.get(1) + i.get(2)+300.0;
}

template<typename vin_type, typename vout_type>
void initialize_buf(vin_type & in, vout_type & out)
{
    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}

int main(int argc, char *argv[])
{
    init_wrappers();

    // Assumed declarations, inferred from how out/in are used below:
    // out holds a scalar, a 2-vector and a 2x2 tensor per grid point,
    // in holds a single 2-vector per grid point.
    grid_gpu<3,aggregate<float,float[2],float[2][2]>> out;
    grid_gpu<3,aggregate<float[2]>> in;

    int nele = NELEMENTS;

    size_t sz[3] = {512,512,256};
    out.resize(sz);
    in.resize(sz);

    out.setMemory();
    in.setMemory();

    initialize_buf(in,out);

    // Read write test with TLS

    auto ite = out.getGPUIterator({0,0,0},{511,511,255});

    // Assumed declaration: bandwidth samples for standard_deviation()
    openfpm::vector<double> res;
    res.resize(100);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        // discard the first 10 iterations as warm-up
        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_tls = 0.0;
    double dev_write_tls = 0.0;
    standard_deviation(res,mean_write_tls,dev_write_tls);

    check_write(in,out);

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_tls = 0.0;
    double dev_read_tls = 0.0;
    standard_deviation(res,mean_read_tls,dev_read_tls);

    check_read(in,out);

    // Read write test with lambda

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                                  blockIdx.y * blockDim.y + threadIdx.y,
                                  blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_in.template get<0>(p)[0];

            vd_out.template get<0>(p) = a;

            vd_out.template get<1>(p)[0] = a;
            vd_out.template get<1>(p)[1] = a;

            vd_out.template get<2>(p)[0][0] = a;
            vd_out.template get<2>(p)[0][1] = a;
            vd_out.template get<2>(p)[1][0] = a;
            vd_out.template get<2>(p)[1][1] = a;
            vd_in.template get<0>(p)[1] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_lamb = 0.0;
    double dev_write_lamb = 0.0;
    standard_deviation(res,mean_write_lamb,dev_write_lamb);

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                                  blockIdx.y * blockDim.y + threadIdx.y,
                                  blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_out.template get<0>(p);

            float b = vd_out.template get<1>(p)[0];
            float c = vd_out.template get<1>(p)[1];

            float d = vd_out.template get<2>(p)[0][0];
            float e = vd_out.template get<2>(p)[0][1];
            float f = vd_out.template get<2>(p)[1][0];
            float g = vd_out.template get<2>(p)[1][1];

            float h = vd_in.template get<0>(p)[0];
            vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_lamb = 0.0;
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);

    // Array benchmark
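    // The raw-array variants below reproduce by hand the SoA layout that the
    // grid accessors resolve to: each property component is a separate plane
    // of out.size() floats, so component k of a property lives at
    // linear_index + k*stride. This isolates the cost (if any) of the get<>
    // abstraction from the cost of the memory traffic itself.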
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];
        int stride = out.size();

        auto lamb_arr_write = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];

            out_s[p1 + p2*sz0 + p3*sz0*sz1] = a;

            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride] = a;
            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;

            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride] = a;
            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_arr = 0.0;
    double dev_write_arr = 0.0;
    standard_deviation(res,mean_write_arr,dev_write_arr);

    check_write(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];
        int stride = out.size();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = out_s[p1 + p2*sz0 + p3*sz0*sz1];

            float b = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            float c = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride];

            float d = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride];
            float e = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride];
            float f = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride];
            float g = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride];

            float h = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_arr = 0.0;
    double dev_read_arr = 0.0;
    standard_deviation(res,mean_read_arr,dev_read_arr);

    check_read(in,out);

    // Reference: raw device-to-device copy bandwidth measured with cudaMemcpy
    #ifdef CUDIFY_USE_CUDA

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * a = (float *)in.getDeviceBuffer<0>();
        float * b = (float *)out.getDeviceBuffer<1>();

        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);

        cudaDeviceSynchronize();

        t.stop();

        // the copy reads and writes 2 floats per element: 4 floats of traffic
        if (i >= 10)
        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_mes = 0.0;
    double dev_read_mes = 0.0;
    standard_deviation(res,mean_read_mes,dev_read_mes);

    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;

    #endif

    std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
    std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;

    std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;

    std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl;
    std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl;
}

#else

int main(int argc, char *argv[])
{
}

#endif

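Note: because __device__ lambdas are passed to CUDA_LAUNCH_LAMBDA, compiling this example with nvcc requires extended lambda support, enabled with the --extended-lambda flag (--expt-extended-lambda on older CUDA toolkits).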