#include "Grid/map_grid.hpp"               // grid_gpu, grid_key_dx (assumed: the buffers below are 3D GPU grids)
#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"
#include "timer.hpp"                        // wall-clock timer used for the measurements

#define NELEMENTS 67108864
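// Memory-bandwidth micro-benchmark: every grid point carries one scalar, one
// 2-component vector and one 2x2 tensor. The same write and read patterns are
// timed four ways on a 512x512x256 grid: OpenFPM kernels using
// toKernel()/get<prop>() ("TLS"), __device__ lambdas, raw device pointers with
// hand-written indexing, and (on the CUDA backend) a plain cudaMemcpy as a
// reference.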
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3> p({blockIdx.x * blockDim.x + threadIdx.x,
                      blockIdx.y * blockDim.y + threadIdx.y,
                      blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_in.template get<0>(p)[0];

    vd_out.template get<0>(p) = a;

    vd_out.template get<1>(p)[0] = a;
    vd_out.template get<1>(p)[1] = a;

    vd_out.template get<2>(p)[0][0] = a;
    vd_out.template get<2>(p)[0][1] = a;
    vd_out.template get<2>(p)[1][0] = a;
    vd_out.template get<2>(p)[1][1] = a;

    vd_in.template get<0>(p)[1] = a;
}
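// The matching read kernel: load the scalar, vector and tensor components plus
// in<0>[0], and fold their sum into in<0>[1] so the loads cannot be optimized
// away.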
template<typename vector_type, typename vector_type2>
__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3> p({blockIdx.x * blockDim.x + threadIdx.x,
                      blockIdx.y * blockDim.y + threadIdx.y,
                      blockIdx.z * blockDim.z + threadIdx.z});

    float a = vd_out.template get<0>(p);

    float b = vd_out.template get<1>(p)[0];
    float c = vd_out.template get<1>(p)[1];

    float d = vd_out.template get<2>(p)[0][0];
    float e = vd_out.template get<2>(p)[0][1];
    float f = vd_out.template get<2>(p)[1][0];
    float g = vd_out.template get<2>(p)[1][1];

    float h = vd_in.template get<0>(p)[0];
    vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
}
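// Host-side check for the write benchmarks: after copying the data back, every
// output component must equal in<0>[0], and in<0>[1] must have been overwritten
// with the same value.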
template<typename in_type, typename out_type>
void check_write(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;

    auto it = in.getIterator();

    while (it.isNext())
    {
        auto i = it.get();

        float a = in.template get<0>(i)[0];

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            // ... (per-element debug hook)
        }

        success &= out.template get<0>(i) == a;

        success &= out.template get<1>(i)[0] == a;
        success &= out.template get<1>(i)[1] == a;

        success &= out.template get<2>(i)[0][0] == a;
        success &= out.template get<2>(i)[0][1] == a;
        success &= out.template get<2>(i)[1][0] == a;
        success &= out.template get<2>(i)[1][1] == a;

        success &= in.template get<0>(i)[1] == a;

        if (success == false)
        {
            std::cout << "FAIL " << a << " " << i.to_string()
                      << " " << out.template get<0>(i)
                      << " " << out.template get<1>(i)[0]
                      << " " << out.template get<1>(i)[1]
                      << " " << out.template get<2>(i)[0][0]
                      << " " << out.template get<2>(i)[0][1]
                      << " " << out.template get<2>(i)[1][0]
                      << " " << out.template get<2>(i)[1][1] << std::endl;
            break;
        }

        ++it;
    }

    if (success == false)
    {std::cout << "FAIL WRITE" << std::endl;}
}
template<typename in_type, typename out_type>
void check_read(in_type & in, out_type & out)
{
    out.template deviceToHost<0,1,2>();
    in.template deviceToHost<0>();

    bool success = true;

    auto it = in.getIterator();

    while (it.isNext())
    {
        auto i = it.get();

        float a = out.template get<0>(i);

        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
        {
            // ... (per-element debug hook)
        }

        float b = out.template get<1>(i)[0];
        float c = out.template get<1>(i)[1];

        float d = out.template get<2>(i)[0][0];
        float e = out.template get<2>(i)[0][1];
        float f = out.template get<2>(i)[1][0];
        float g = out.template get<2>(i)[1][1];

        float h = in.template get<0>(i)[0];

        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);

        if (success == false)
        {
            std::cout << "FAIL READ " << i.to_string()
                      << " " << in.template get<0>(i)[1]
                      << " != " << a+b+c+d+e+f+g+h << std::endl;
            break;
        }

        ++it;
    }
}
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
    grid_key_dx<3> i({blockIdx.x * blockDim.x + threadIdx.x,
                      blockIdx.y * blockDim.y + threadIdx.y,
                      blockIdx.z * blockDim.z + threadIdx.z});

    vd_in.template get<0>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_in.template get<0>(i)[1] = i.get(0) + i.get(1) + i.get(2) + 100.0;

    vd_out.template get<0>(i) = i.get(0) + i.get(1) + i.get(2) + 200.0;

    vd_out.template get<1>(i)[0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<1>(i)[1] = i.get(0) + i.get(1) + i.get(2) + 100.0;

    vd_out.template get<2>(i)[0][0] = i.get(0) + i.get(1) + i.get(2);
    vd_out.template get<2>(i)[0][1] = i.get(0) + i.get(1) + i.get(2) + 100.0;
    vd_out.template get<2>(i)[1][0] = i.get(0) + i.get(1) + i.get(2) + 200.0;
    vd_out.template get<2>(i)[1][1] = i.get(0) + i.get(1) + i.get(2) + 300.0;
}

template<typename vin_type, typename vout_type>
void initialize_buf(vin_type & in, vout_type & out)
{
    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}
int main(int argc, char *argv[])
{
    int nele = NELEMENTS;

    size_t sz[3] = {512,512,256};

    // Assumed container declarations: 3D GPU grids whose properties match the
    // accesses in the kernels above (out: scalar + 2-vector + 2x2 tensor,
    // in: 2-vector). The exact types may differ in the original.
    grid_gpu<3,aggregate<float,float[2],float[2][2]>> out(sz);
    grid_gpu<3,aggregate<float[2]>> in(sz);

    out.setMemory();
    in.setMemory();

    // Holds the 100 timed samples of each benchmark (iterations 10..109).
    openfpm::vector<double> res(100);

    initialize_buf(in,out);

    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
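    // Benchmark 1: write with the "TLS" interface (toKernel() + get<prop> inside a
    // __global__ kernel). 110 iterations; the first 10 warm the device up and are
    // discarded. Each element moves nine 4-byte floats (one load, eight stores),
    // hence the nele*4*9 bytes used in the bandwidth figure.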
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_tls = 0.0;
    double dev_write_tls = 0.0;
    standard_deviation(res,mean_write_tls,dev_write_tls);

    check_write(in,out);    // verify the written data (call site assumed)
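    // Benchmark 2: read with the TLS interface (translate_fill_prop_read).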
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_tls = 0.0;
    double dev_read_tls = 0.0;
    standard_deviation(res,mean_read_tls,dev_read_tls);

    check_read(in,out);     // verify the read result (call site assumed)
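    // Benchmark 3: the same write pattern expressed as a __device__ lambda and
    // launched with CUDA_LAUNCH_LAMBDA, to compare against the plain kernel.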
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3> p({blockIdx.x * blockDim.x + threadIdx.x,
                              blockIdx.y * blockDim.y + threadIdx.y,
                              blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_in.template get<0>(p)[0];

            vd_out.template get<0>(p) = a;

            vd_out.template get<1>(p)[0] = a;
            vd_out.template get<1>(p)[1] = a;

            vd_out.template get<2>(p)[0][0] = a;
            vd_out.template get<2>(p)[0][1] = a;
            vd_out.template get<2>(p)[1][0] = a;
            vd_out.template get<2>(p)[1][1] = a;

            vd_in.template get<0>(p)[1] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_lamb = 0.0;
    double dev_write_lamb = 0.0;
    standard_deviation(res,mean_write_lamb,dev_write_lamb);
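    // Benchmark 4: the read pattern as a __device__ lambda.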
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        auto vd_out = out.toKernel();
        auto vd_in = in.toKernel();

        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            grid_key_dx<3> p({blockIdx.x * blockDim.x + threadIdx.x,
                              blockIdx.y * blockDim.y + threadIdx.y,
                              blockIdx.z * blockDim.z + threadIdx.z});

            float a = vd_out.template get<0>(p);

            float b = vd_out.template get<1>(p)[0];
            float c = vd_out.template get<1>(p)[1];

            float d = vd_out.template get<2>(p)[0][0];
            float e = vd_out.template get<2>(p)[0][1];
            float f = vd_out.template get<2>(p)[1][0];
            float g = vd_out.template get<2>(p)[1][1];

            float h = vd_in.template get<0>(p)[0];
            vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite, lamb);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_lamb = 0.0;
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);
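    // Benchmark 5: write through raw device pointers obtained with
    // getDeviceBuffer<prop>(). Properties are laid out SoA, so consecutive
    // components of the same property are one `stride` (= number of grid points)
    // apart, and the linear index is computed by hand from (p1,p2,p3).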
    initialize_buf(in,out);

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];

        int stride = out.size();

        auto lamb_arr_write = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];

            out_s[p1 + p2*sz0 + p3*sz0*sz1] = a;

            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride] = a;
            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;

            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride] = a;
            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride] = a;

            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_write_arr = 0.0;
    double dev_write_arr = 0.0;
    standard_deviation(res,mean_write_arr,dev_write_arr);
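    // Benchmark 6: read through raw device pointers, accumulating into in_v as in
    // the other read benchmarks. Note the buffers are not re-initialized here.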
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        float * out_s = (float *)out.getDeviceBuffer<0>();
        float * out_v = (float *)out.getDeviceBuffer<1>();
        float * out_m = (float *)out.getDeviceBuffer<2>();
        float * in_v = (float *)in.getDeviceBuffer<0>();

        int sz0 = sz[0];
        int sz1 = sz[1];
        int sz2 = sz[2];

        int stride = out.size();

        auto lamb_arr_red = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
        {
            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;

            float a = out_s[p1 + p2*sz0 + p3*sz0*sz1];

            float b = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            float c = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride];

            float d = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride];
            float e = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride];
            float f = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride];
            float g = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride];

            float h = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a+b+c+d+e+f+g+h;
        };

        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}

        std::cout << "Time ARR: " << t.getwct() << std::endl;
        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_arr = 0.0;
    double dev_read_arr = 0.0;
    standard_deviation(res,mean_read_arr,dev_read_arr);
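    // Reference figure: a plain device-to-device cudaMemcpy of the 2-component
    // vector property (2*NELEMENTS floats copied, i.e. nele*4*4 bytes of traffic
    // counting both the read and the write). CUDA backend only.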
    #ifdef CUDIFY_USE_CUDA

    for (int i = 0 ; i < 110 ; i++)
    {
        cudaDeviceSynchronize();

        timer t;
        t.start();

        float * a = (float *)in.getDeviceBuffer<0>();
        float * b = (float *)out.getDeviceBuffer<1>();

        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);

        cudaDeviceSynchronize();

        t.stop();

        if (i >= 10)
        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}

        std::cout << "Time: " << t.getwct() << std::endl;
        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
    }

    double mean_read_mes = 0.0;
    double dev_read_mes = 0.0;
    standard_deviation(res,mean_read_mes,dev_read_mes);

    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;

    #endif
    std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
    std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;

    std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;

    std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl;
    std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl;

    return 0;
}