OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
main.cu
#ifdef __NVCC__

#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"

#define NELEMENTS 16777216

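// translate_fill_prop_write: one thread per element; it reads a single float from
// the input property and writes it to the scalar, vector and tensor properties of
// the output (plus the second input component), touching 9 floats per element.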
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
    auto p = blockIdx.x * blockDim.x + threadIdx.x;

    float a = vd_in.template get<0>(p)[0];

    vd_out.template get<0>(p) = a;

    vd_out.template get<1>(p)[0] = a;
    vd_out.template get<1>(p)[1] = a;

    vd_out.template get<2>(p)[0][0] = a;
    vd_out.template get<2>(p)[0][1] = a;
    vd_out.template get<2>(p)[1][0] = a;
    vd_out.template get<2>(p)[1][1] = a;
    vd_in.template get<0>(p)[1] = a;
}

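// translate_fill_prop_read: the mirror benchmark; it reads the seven output values
// and one input value back and stores their sum in the second input component.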
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
    auto p = blockIdx.x * blockDim.x + threadIdx.x;

    float a = vd_out.template get<0>(p);

    float b = vd_out.template get<1>(p)[0];
    float c = vd_out.template get<1>(p)[1];

    float d = vd_out.template get<2>(p)[0][0];
    float e = vd_out.template get<2>(p)[0][1];
    float f = vd_out.template get<2>(p)[1][0];
    float g = vd_out.template get<2>(p)[1][1];

    float h = vd_in.template get<0>(p)[0];
    vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
}

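// check_write: copies the buffers back to the host and verifies that every output
// property holds the value the write kernel copied from the input.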
template<typename in_type, typename out_type>
void check_write(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    for (int i = 0 ; i < NELEMENTS ; i++)
    {
        float a = in.template get<0>(i)[0];

        success &= out.template get<0>(i) == a;

        success &= out.template get<1>(i)[0] == a;
        success &= out.template get<1>(i)[1] == a;

        success &= out.template get<2>(i)[0][0] == a;
        success &= out.template get<2>(i)[0][1] == a;
        success &= out.template get<2>(i)[1][0] == a;
        success &= out.template get<2>(i)[1][1] == a;

        success &= in.template get<0>(i)[1] == a;
    }

    if (success == false)
    {
        std::cout << "FAIL WRITE" << std::endl;
        exit(1);
    }
}

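// check_read: verifies that the second input component contains the sum produced
// by the read kernel for every element.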
template<typename in_type, typename out_type>
void check_read(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    for (int i = 0 ; i < NELEMENTS ; i++)
    {
        float a = out.template get<0>(i);

        float b = out.template get<1>(i)[0];
        float c = out.template get<1>(i)[1];

        float d = out.template get<2>(i)[0][0];
        float e = out.template get<2>(i)[0][1];
        float f = out.template get<2>(i)[1][0];
        float g = out.template get<2>(i)[1][1];

        float h = in.template get<0>(i)[0];

        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);

        if (success == false)
        {
            std::cout << "FAIL READ " << i << " " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
            exit(1);
        }
    }
}

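// initialize_buff / initialize_buf: fill both buffers with known per-element values
// on the device before each benchmark run.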
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
    auto i = blockIdx.x * blockDim.x + threadIdx.x;

    vd_in.template get<0>(i)[0] = i;
    vd_in.template get<0>(i)[1] = i+100.0;

    vd_out.template get<0>(i) = i+200.0;

    vd_out.template get<1>(i)[0] = i;
    vd_out.template get<1>(i)[1] = i+100.0;

    vd_out.template get<2>(i)[0][0] = i;
    vd_out.template get<2>(i)[0][1] = i+100.0;
    vd_out.template get<2>(i)[1][0] = i+200.0;
    vd_out.template get<2>(i)[1][1] = i+300.0;
}

// The vectors are taken by reference so the kernel initializes the caller's buffers
// rather than temporary copies.
template<typename vin_type, typename vout_type>
void initialize_buf(vin_type & in, vout_type & out)
{
    auto ite = out.getGPUIterator(256);
    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}

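// The benchmarks below time the same streaming access pattern in three ways:
// through the property views returned by toKernel() ("TLS" in the comments),
// through device lambdas, and through raw pointers from getDeviceBuffer<>().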
int main(int argc, char *argv[])
{
    init_wrappers();

    // Device buffers (declarations reconstructed from the accesses below): 'out'
    // carries a scalar, a float[2] and a float[2][2] property per element, 'in' a
    // single float[2] property.
    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
    openfpm::vector_gpu<aggregate<float[2]>> in;

    int nele = NELEMENTS;

    out.resize(nele);
    in.resize(nele);

    initialize_buf(in,out);

    // Read write test with TLS

    auto ite = out.getGPUIterator(256);

    // Bandwidth samples of the timed iterations (declaration reconstructed)
    openfpm::vector<double> res;
    res.resize(100);

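    // Each benchmark below runs 110 iterations: the first 10 warm up the device and
    // the remaining 100 samples feed standard_deviation(). The bandwidth figure
    // assumes 9 floats (36 bytes) of traffic per element, matching the kernels above.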
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_tls = 0.0;
    double dev_write_tls = 0.0;
    standard_deviation(res,mean_write_tls,dev_write_tls);

    check_write(in,out);

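    // Same measurement for the read kernel; the buffers are re-initialized first
    // so the input values are known.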
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_tls = 0.0;
    double dev_read_tls = 0.0;
    standard_deviation(res,mean_read_tls,dev_read_tls);

    check_read(in,out);

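    // Read write test with device lambdas: the same access pattern, but the kernel
    // body is a lambda launched with CUDA_LAUNCH_LAMBDA.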
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = vd_in.template get<0>(p)[0];

            vd_out.template get<0>(p) = a;

            vd_out.template get<1>(p)[0] = a;
            vd_out.template get<1>(p)[1] = a;

            vd_out.template get<2>(p)[0][0] = a;
            vd_out.template get<2>(p)[0][1] = a;
            vd_out.template get<2>(p)[1][0] = a;
            vd_out.template get<2>(p)[1][1] = a;
            vd_in.template get<0>(p)[1] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_lamb = 0.0;
    double dev_write_lamb = 0.0;
    standard_deviation(res,mean_write_lamb,dev_write_lamb);

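    // Lambda version of the read benchmark.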
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = vd_out.template get<0>(p);

            float b = vd_out.template get<1>(p)[0];
            float c = vd_out.template get<1>(p)[1];

            float d = vd_out.template get<2>(p)[0][0];
            float e = vd_out.template get<2>(p)[0][1];
            float f = vd_out.template get<2>(p)[1][0];
            float g = vd_out.template get<2>(p)[1][1];

            float h = vd_in.template get<0>(p)[0];
            vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_lamb = 0.0;
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);

    // Array benchmark

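    // getDeviceBuffer<>() exposes the SoA storage of each property as a raw pointer;
    // 'stride' (the vector capacity) is the distance between the components of the
    // multi-valued properties.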
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_write = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = in_v[p + 0*stride];

            out_s[p] = a;

            out_v[p + 0*stride] = a;
            out_v[p + 1*stride] = a;

            out_m[p + 0*2*stride + 0*stride] = a;
            out_m[p + 0*2*stride + 1*stride] = a;
            out_m[p + 1*2*stride + 0*stride] = a;
            out_m[p + 1*2*stride + 1*stride] = a;
            in_v[p + 1*stride] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_arr = 0.0;
    double dev_write_arr = 0.0;
    standard_deviation(res,mean_write_arr,dev_write_arr);

    check_write(in,out);

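    // Raw-pointer read benchmark: computes the same per-element sum as
    // translate_fill_prop_read, so check_read() validates it as well.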
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = out_s[p];

            float b = out_v[p + 0*stride];
            float c = out_v[p + 1*stride];

            float d = out_m[p + 0*2*stride + 0*stride];
            float e = out_m[p + 0*2*stride + 1*stride];
            float f = out_m[p + 1*2*stride + 0*stride];
            float g = out_m[p + 1*2*stride + 1*stride];

            float h = in_v[p + 0*stride];
            in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_arr = 0.0;
    double dev_read_arr = 0.0;
    standard_deviation(res,mean_read_arr,dev_read_arr);

    check_read(in,out);

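    // Single-core host baseline (OpenMP backend only): the same raw-pointer read
    // loop executed serially on the CPU.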
    #ifdef CUDIFY_USE_OPENMP

    for (int i = 0 ; i < 110 ; i++)
    {
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __host__ (int p)
        {
            float a = out_s[p];

            float b = out_v[p + 0*stride];
            float c = out_v[p + 1*stride];

            float d = out_m[p + 0*2*stride + 0*stride];
            float e = out_m[p + 0*2*stride + 1*stride];
            float f = out_m[p + 1*2*stride + 0*stride];
            float g = out_m[p + 1*2*stride + 1*stride];

            float h = in_v[p + 0*stride];
            in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
        };

        // use a separate index so the benchmark counter 'i' is not shadowed
        for (int j = 0 ; j < NELEMENTS ; j++)
        {
            lamb_arr_red(j);
        }

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW 1-CORE ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    #endif

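    // Reference measurement (CUDA backend only): a plain device-to-device cudaMemcpy
    // of 2*NELEMENTS floats; it reads and writes 2 floats per element, hence the
    // nele*4*4 factor in the bandwidth estimate.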
    #ifdef CUDIFY_USE_CUDA

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * a = (float *)in.getDeviceBuffer<0>();
        float * b = (float *)out.getDeviceBuffer<1>();

        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_mes = 0.0;
    double dev_read_mes = 0.0;
    standard_deviation(res,mean_read_mes,dev_read_mes);

    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;

    #endif

//  std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
//  std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;

    std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;

    std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl;
    std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl;
}

#else

int main(int argc, char *argv[])
{
}

#endif

Referenced API:
- timer: class for cpu time benchmarking (timer.hpp:27)
- timer::start(): start the timer (timer.hpp:90)
- timer::stop(): stop the timer (timer.hpp:119)
- timer::getwct(): return the elapsed real time (timer.hpp:130)
- openfpm::vector: implementation of a 1-D std::vector like structure (map_vector.hpp:202)