OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
main.cu
#ifdef __NVCC__

#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"

#define NELEMENTS 67108864

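// Memory-bandwidth benchmark over a 512x512x256 grid (NELEMENTS = 512*512*256
// points). The same write and read patterns are timed three ways below:
// templated __global__ kernels, device lambdas launched with CUDA_LAUNCH_LAMBDA,
// and raw device pointers obtained with getDeviceBuffer<>, so the cost of the
// data-structure abstraction can be compared against plain array accesses.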
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_in.template get<0>(p)[0];

    vd_out.template get<0>(p) = a;

    vd_out.template get<1>(p)[0] = a;
    vd_out.template get<1>(p)[1] = a;

    vd_out.template get<2>(p)[0][0] = a;
    vd_out.template get<2>(p)[0][1] = a;
    vd_out.template get<2>(p)[1][0] = a;
    vd_out.template get<2>(p)[1][1] = a;
    vd_in.template get<0>(p)[1] = a;
}

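// Read kernel: 7 reads from the scalar/vector/tensor properties of vd_out plus
// one read from vd_in, with the sum written back to vd_in so that the reads
// cannot be optimized away (8 reads + 1 write per grid point).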
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_out.template get<0>(p);

    float b = vd_out.template get<1>(p)[0];
    float c = vd_out.template get<1>(p)[1];

    float d = vd_out.template get<2>(p)[0][0];
    float e = vd_out.template get<2>(p)[0][1];
    float f = vd_out.template get<2>(p)[1][0];
    float g = vd_out.template get<2>(p)[1][1];

    float h = vd_in.template get<0>(p)[0];
    vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
}

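// Host-side verification of the write kernel: copies the touched properties
// back with deviceToHost<> and checks that every component holds the value
// that was broadcast from in<0>[0].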
template<typename in_type, typename out_type>
void check_write(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    auto it = in.getIterator();
    while (it.isNext())
    {
        auto i = it.get();

        float a = in.template get<0>(i)[0];

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            success &= a != 0;
        }

        success &= out.template get<0>(i) == a;

        success &= out.template get<1>(i)[0] == a;
        success &= out.template get<1>(i)[1] == a;

        success &= out.template get<2>(i)[0][0] == a;
        success &= out.template get<2>(i)[0][1] == a;
        success &= out.template get<2>(i)[1][0] == a;
        success &= out.template get<2>(i)[1][1] == a;

        success &= in.template get<0>(i)[1] == a;

        if (success == false)
        {
            std::cout << "FAIL " << a << " " << i.to_string() << " " << out.template get<0>(i) << " " << out.template get<1>(i)[0] << " "
                      << out.template get<1>(i)[1] << " " << out.template get<2>(i)[0][0] << " "
                      << out.template get<2>(i)[0][1] << " " << out.template get<2>(i)[1][0] << " "
                      << out.template get<2>(i)[1][1] << std::endl;
            break;
        }

        ++it;
    }

    if (success == false)
    {
        std::cout << "FAIL WRITE" << std::endl;
        exit(1);
    }
}

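// Host-side verification of the read kernel: checks that in<0>[1] holds the
// sum of the eight values read on the device.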
template<typename in_type, typename out_type>
void check_read(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;
    auto it = in.getIterator();
    while (it.isNext())
    {
        auto i = it.get();

        float a = out.template get<0>(i);

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            success &= a != 0;
        }

        float b = out.template get<1>(i)[0];
        float c = out.template get<1>(i)[1];

        float d = out.template get<2>(i)[0][0];
        float e = out.template get<2>(i)[0][1];
        float f = out.template get<2>(i)[1][0];
        float g = out.template get<2>(i)[1][1];

        float h = in.template get<0>(i)[0];

        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);

        if (success == false)
        {
            std::cout << "FAIL READ " << i.to_string() << " " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
            exit(1);
        }

        ++it;
    }
}

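// Fills every property with a position-dependent pattern; initialize_buf
// launches the kernel over the whole 512x512x256 domain and is re-run before
// each benchmark so all tests start from the same data.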
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3,int> i({blockIdx.x * blockDim.x + threadIdx.x,
                          blockIdx.y * blockDim.y + threadIdx.y,
                          blockIdx.z * blockDim.z + threadIdx.z});

    vd_in.template get<0>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_in.template get<0>(i)[1] = i.get(0) + i.get(1) + i.get(2) + 100.0;

    vd_out.template get<0>(i) = i.get(0) + i.get(1) + i.get(2) + 200.0;

    vd_out.template get<1>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<1>(i)[1] = i.get(0) + i.get(1) + i.get(2) + 100.0;

    vd_out.template get<2>(i)[0][0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<2>(i)[0][1] = i.get(0) + i.get(1) + i.get(2) + 100.0;
    vd_out.template get<2>(i)[1][0] = i.get(0) + i.get(1) + i.get(2) + 200.0;
    vd_out.template get<2>(i)[1][1] = i.get(0) + i.get(1) + i.get(2) + 300.0;
}

template<typename vin_type, typename vout_type>
void initialize_buf(vin_type & in, vout_type & out)
{
    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}

int main(int argc, char *argv[])
{
    init_wrappers();

    // NOTE: the declarations of 'out' and 'in' were lost in this listing; they
    // are reconstructed here from how the code uses them below (3D GPU grids
    // with a float / float[2] / float[2][2] aggregate for 'out' and a float[2]
    // aggregate for 'in'). The exact type alias may differ in the original
    // example, and the grid header (e.g. "Grid/map_grid.hpp") would also have
    // to be included.
    grid_gpu<3,aggregate<float,float[2],float[2][2]>> out;
    grid_gpu<3,aggregate<float[2]>> in;

    int nele = NELEMENTS;

    size_t sz[3] = {512,512,256};
    out.resize(sz);
    in.resize(sz);

    out.setMemory();
    in.setMemory();

    initialize_buf(in,out);

    // Read write test with TLS

    auto ite = out.getGPUIterator({0,0,0},{511,511,255});

    // 'res' collects one bandwidth sample per timed iteration
    // (declaration reconstructed from usage).
    openfpm::vector<double> res;
    res.resize(100);

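    // Write test through the toKernel() interface: 110 launches, the first 10
    // are warm-up and are discarded; each grid point moves 9 floats
    // (1 read + 8 writes), hence the nele*4*9 bytes in the bandwidth estimate.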
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_tls = 0.0;
    double dev_write_tls = 0.0;
    standard_deviation(res,mean_write_tls,dev_write_tls);

    check_write(in,out);

    initialize_buf(in,out);

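    // Read test through the toKernel() interface: translate_fill_prop_read does
    // 8 reads and 1 write per grid point, so the same 9-access formula applies.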
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_tls = 0.0;
    double dev_read_tls = 0.0;
    standard_deviation(res,mean_read_tls,dev_read_tls);

    check_read(in,out);

    initialize_buf(in,out);

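    // Same write pattern expressed as a device lambda and launched with
    // CUDA_LAUNCH_LAMBDA; the lambda captures the kernel views by value, so this
    // measures whether the lambda path adds overhead over the templated kernel.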
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                                  blockIdx.y * blockDim.y + threadIdx.y,
                                  blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_in.template get<0>(p)[0];

            vd_out.template get<0>(p) = a;

            vd_out.template get<1>(p)[0] = a;
            vd_out.template get<1>(p)[1] = a;

            vd_out.template get<2>(p)[0][0] = a;
            vd_out.template get<2>(p)[0][1] = a;
            vd_out.template get<2>(p)[1][0] = a;
            vd_out.template get<2>(p)[1][1] = a;
            vd_in.template get<0>(p)[1] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_lamb = 0.0;
    double dev_write_lamb = 0.0;
    standard_deviation(res,mean_write_lamb,dev_write_lamb);

    initialize_buf(in,out);

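    // The read pattern expressed as a device lambda, timed the same way.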
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
                                  blockIdx.y * blockDim.y + threadIdx.y,
                                  blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_out.template get<0>(p);

            float b = vd_out.template get<1>(p)[0];
            float c = vd_out.template get<1>(p)[1];

            float d = vd_out.template get<2>(p)[0][0];
            float e = vd_out.template get<2>(p)[0][1];
            float f = vd_out.template get<2>(p)[1][0];
            float g = vd_out.template get<2>(p)[1][1];

            float h = vd_in.template get<0>(p)[0];
            vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_lamb = 0.0;
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);

    // Array benchmark
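    // Raw-array version of the write test: getDeviceBuffer<p>() returns the
    // bare device pointer of property p. The lambdas below index the components
    // of the vector/tensor properties with a stride of out.size() elements,
    // i.e. the data is laid out component by component (structure of arrays).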
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v  = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];
        int stride = out.size();

        auto lamb_arr_write = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];

            out_s[p1 + p2*sz0 + p3*sz0*sz1] = a;

            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride] = a;
            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;

            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride] = a;
            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_arr = 0.0;
    double dev_write_arr = 0.0;
    standard_deviation(res,mean_write_arr,dev_write_arr);

    check_write(in,out);

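    // Raw-array version of the read test (8 reads + 1 write per grid point,
    // same indexing as above).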
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v  = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];
        int stride = out.size();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = out_s[p1 + p2*sz0 + p3*sz0*sz1];

            float b = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            float c = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride];

            float d = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride];
            float e = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride];
            float f = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride];
            float g = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride];

            float h = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_arr = 0.0;
    double dev_read_arr = 0.0;
    standard_deviation(res,mean_read_arr,dev_read_arr);

    check_read(in,out);

    #ifdef CUDIFY_USE_CUDA

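    // Reference measurement with plain cudaMemcpy (CUDA backend only): copy the
    // two float components of out's property 1 into in's property 0, i.e.
    // 2*NELEMENTS floats read and 2*NELEMENTS written, hence the nele*4*4 bytes.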
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();
        timer t;
        t.start();

        float * a = (float *)in.getDeviceBuffer<0>();
        float * b = (float *)out.getDeviceBuffer<1>();

        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_mes = 0.0;
    double dev_read_mes = 0.0;
    standard_deviation(res,mean_read_mes,dev_read_mes);

    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;

    #endif

    std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
    std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;

    std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;

    std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl;
    std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl;
}

#else

int main(int argc, char *argv[])
{
}

#endif

Referenced symbols (from the linked documentation):
grid_key_dx: the key to access any element in the grid (grid_key.hpp:19)
openfpm::vector: implementation of a 1-D std::vector-like structure
timer: class for CPU time benchmarking (timer.hpp:28)
timer::start(): start the timer (timer.hpp:90)
timer::stop(): stop the timer (timer.hpp:119)
timer::getwct(): return the elapsed real (wall-clock) time (timer.hpp:130)