OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
main.cu
#ifdef __NVCC__

#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"

#define NELEMENTS 16777216

// Write benchmark kernel: broadcast the first component of the input property
// into every component of the three output properties (scalar, vector, tensor)
// and into the second component of the input property.
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
    auto p = blockIdx.x * blockDim.x + threadIdx.x;

    float a = vd_in.template get<0>(p)[0];

    vd_out.template get<0>(p) = a;

    vd_out.template get<1>(p)[0] = a;
    vd_out.template get<1>(p)[1] = a;

    vd_out.template get<2>(p)[0][0] = a;
    vd_out.template get<2>(p)[0][1] = a;
    vd_out.template get<2>(p)[1][0] = a;
    vd_out.template get<2>(p)[1][1] = a;
    vd_in.template get<0>(p)[1] = a;
}

// Read benchmark kernel: read all output components plus the first input
// component and store their sum in the second input component.
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
    auto p = blockIdx.x * blockDim.x + threadIdx.x;

    float a = vd_out.template get<0>(p);

    float b = vd_out.template get<1>(p)[0];
    float c = vd_out.template get<1>(p)[1];

    float d = vd_out.template get<2>(p)[0][0];
    float e = vd_out.template get<2>(p)[0][1];
    float f = vd_out.template get<2>(p)[1][0];
    float g = vd_out.template get<2>(p)[1][1];

    float h = vd_in.template get<0>(p)[0];
    vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
}

// Host-side check that the write kernel produced the expected values.
template<typename in_type, typename out_type>
void check_write(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    for (int i = 0 ; i < NELEMENTS ; i++)
    {
        float a = in.template get<0>(i)[0];

        success &= out.template get<0>(i) == a;

        success &= out.template get<1>(i)[0] == a;
        success &= out.template get<1>(i)[1] == a;

        success &= out.template get<2>(i)[0][0] == a;
        success &= out.template get<2>(i)[0][1] == a;
        success &= out.template get<2>(i)[1][0] == a;
        success &= out.template get<2>(i)[1][1] == a;

        success &= in.template get<0>(i)[1] == a;
    }

    if (success == false)
    {
        std::cout << "FAIL WRITE" << std::endl;
        exit(1);
    }
}

// Host-side check that the read kernel accumulated the expected sum.
template<typename in_type, typename out_type>
void check_read(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    for (int i = 0 ; i < NELEMENTS ; i++)
    {
        float a = out.template get<0>(i);

        float b = out.template get<1>(i)[0];
        float c = out.template get<1>(i)[1];

        float d = out.template get<2>(i)[0][0];
        float e = out.template get<2>(i)[0][1];
        float f = out.template get<2>(i)[1][0];
        float g = out.template get<2>(i)[1][1];

        float h = in.template get<0>(i)[0];

        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);

        if (success == false)
        {
            std::cout << "FAIL READ " << i << " " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
            exit(1);
        }
    }
}

// Fill the input and output buffers with known values on the device.
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
    auto i = blockIdx.x * blockDim.x + threadIdx.x;

    vd_in.template get<0>(i)[0] = i;
    vd_in.template get<0>(i)[1] = i+100.0;

    vd_out.template get<0>(i) = i+200.0;

    vd_out.template get<1>(i)[0] = i;
    vd_out.template get<1>(i)[1] = i+100.0;

    vd_out.template get<2>(i)[0][0] = i;
    vd_out.template get<2>(i)[0][1] = i+100.0;
    vd_out.template get<2>(i)[1][0] = i+200.0;
    vd_out.template get<2>(i)[1][1] = i+300.0;
}

// Launch the initialization kernel over all elements. The buffers are taken by
// reference so the caller's vectors are the ones being initialized.
template<typename vin_type, typename vout_type>
void initialize_buf(vin_type & in, vout_type & out)
{
    auto ite = out.getGPUIterator(256);
    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}

int main(int argc, char *argv[])
{
    init_wrappers();

    // Benchmark buffers: "out" has a scalar, a 2-component vector and a 2x2
    // tensor property; "in" has a single 2-component vector property.
    // (These declarations are not shown in the original listing; they are
    // inferred from how the buffers are used in this example.)
    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
    openfpm::vector_gpu<aggregate<float[2]>> in;

    int nele = NELEMENTS;

    out.resize(nele);
    in.resize(nele);

    initialize_buf(in,out);

    // Read write test with TLS

    auto ite = out.getGPUIterator(256);

    // Bandwidth samples of the timed iterations (declaration inferred, as above)
    openfpm::vector<double> res;
    res.resize(100);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_tls = 0.0;
    double dev_write_tls = 0.0;
    standard_deviation(res,mean_write_tls,dev_write_tls);

    check_write(in,out);

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_tls = 0.0;
    double dev_read_tls = 0.0;
    standard_deviation(res,mean_read_tls,dev_read_tls);

    check_read(in,out);

    // Read write test with device lambdas

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = vd_in.template get<0>(p)[0];

            vd_out.template get<0>(p) = a;

            vd_out.template get<1>(p)[0] = a;
            vd_out.template get<1>(p)[1] = a;

            vd_out.template get<2>(p)[0][0] = a;
            vd_out.template get<2>(p)[0][1] = a;
            vd_out.template get<2>(p)[1][0] = a;
            vd_out.template get<2>(p)[1][1] = a;
            vd_in.template get<0>(p)[1] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_lamb = 0.0;
    double dev_write_lamb = 0.0;
    standard_deviation(res,mean_write_lamb,dev_write_lamb);

    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = vd_out.template get<0>(p);

            float b = vd_out.template get<1>(p)[0];
            float c = vd_out.template get<1>(p)[1];

            float d = vd_out.template get<2>(p)[0][0];
            float e = vd_out.template get<2>(p)[0][1];
            float f = vd_out.template get<2>(p)[1][0];
            float g = vd_out.template get<2>(p)[1][1];

            float h = vd_in.template get<0>(p)[0];
            vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_lamb = 0.0;
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);

    // Array benchmark

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_write = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = in_v[p + 0*stride];

            out_s[p] = a;

            out_v[p + 0*stride] = a;
            out_v[p + 1*stride] = a;

            out_m[p + 0*2*stride + 0*stride] = a;
            out_m[p + 0*2*stride + 1*stride] = a;
            out_m[p + 1*2*stride + 0*stride] = a;
            out_m[p + 1*2*stride + 1*stride] = a;
            in_v[p + 1*stride] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_arr = 0.0;
    double dev_write_arr = 0.0;
    standard_deviation(res,mean_write_arr,dev_write_arr);

    check_write(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;

            float a = out_s[p];

            float b = out_v[p + 0*stride];
            float c = out_v[p + 1*stride];

            float d = out_m[p + 0*2*stride + 0*stride];
            float e = out_m[p + 0*2*stride + 1*stride];
            float f = out_m[p + 1*2*stride + 0*stride];
            float g = out_m[p + 1*2*stride + 1*stride];

            float h = in_v[p + 0*stride];
            in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_arr = 0.0;
    double dev_read_arr = 0.0;
    standard_deviation(res,mean_read_arr,dev_read_arr);

    check_read(in,out);

    #ifdef CUDIFY_USE_OPENMP

    // Single-core CPU reference using the same raw-array access pattern.
    for (int i = 0 ; i < 110 ; i++)
    {
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int stride = out.capacity();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __host__ (int p)
        {
            float a = out_s[p];

            float b = out_v[p + 0*stride];
            float c = out_v[p + 1*stride];

            float d = out_m[p + 0*2*stride + 0*stride];
            float e = out_m[p + 0*2*stride + 1*stride];
            float f = out_m[p + 1*2*stride + 0*stride];
            float g = out_m[p + 1*2*stride + 1*stride];

            float h = in_v[p + 0*stride];
            in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
        };

        for (int j = 0 ; j < NELEMENTS ; j++)
        {
            lamb_arr_red(j);
        }

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW 1-CORE ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    #endif

    #ifdef CUDIFY_USE_CUDA

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * a = (float *)in.getDeviceBuffer<0>();
        float * b = (float *)out.getDeviceBuffer<1>();

        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_mes = 0.0;
    double dev_read_mes = 0.0;
    standard_deviation(res,mean_read_mes,dev_read_mes);

    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;

    #endif

//  std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
//  std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;

    std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;

    std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl;
    std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl;
}

#else

int main(int argc, char *argv[])
{
}

#endif
Referenced symbols:

Implementation of a 1-D std::vector-like structure.
Distributed vector.
timer: class for CPU time benchmarking (definition: timer.hpp:28)
    void start(): start the timer (definition: timer.hpp:90)
    void stop(): stop the timer (definition: timer.hpp:119)
    double getwct(): return the elapsed real time (definition: timer.hpp:130)
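
A minimal sketch of the timing pattern these timer entries describe, as used throughout main.cu above; the report_bandwidth wrapper and its bytes_moved parameter are illustrative names, not part of the example:

void report_bandwidth(size_t bytes_moved)
{
    timer t;
    t.start();                       // start the wall-clock measurement

    // ... do the work to be measured (e.g. launch a kernel and synchronize) ...

    t.stop();                        // stop the measurement
    double wct = t.getwct();         // elapsed real time in seconds
    std::cout << "BW: " << (double)bytes_moved / wct * 1e-9 << " GB/s" << std::endl;
}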