3#include "Vector/map_vector.hpp"
4#include "util/stat/common_statistics.hpp"
6#define NELEMENTS 16777216
// Device kernel body — aggregate-layout (TLS) WRITE benchmark.
// Each thread loads component [0] of vd_in's property<0> and fans the value out
// to vd_out's scalar property<0>, both components of vector property<1>, all
// four entries of the 2x2 tensor property<2>, and finally component [1] of
// vd_in's property<0> — i.e. 1 float load + 9 float stores per thread, which is
// the 4*9-byte figure the driver uses for its bandwidth computation.
// NOTE(review): the "__global__ void translate_fill_prop_write(vector_type
// vd_out, vector_type2 vd_in)" signature line and the enclosing braces are
// missing from this extracted chunk, and the digits fused at the start of lines
// are extraction artifacts, not code — restore from the original file.
9template<
typename vector_type,
typename vector_type2>
// Flat 1-D global thread index. No bounds guard is visible in this chunk —
// presumably the launch iterator sizes the grid exactly to the element count
// (TODO confirm against getGPUIterator(256) semantics).
12 auto p = blockIdx.x * blockDim.x + threadIdx.x;
14 float a = vd_in.template get<0>(p)[0];
16 vd_out.template get<0>(p) = a;
18 vd_out.template get<1>(p)[0] = a;
19 vd_out.template get<1>(p)[1] = a;
21 vd_out.template get<2>(p)[0][0] = a;
22 vd_out.template get<2>(p)[0][1] = a;
23 vd_out.template get<2>(p)[1][0] = a;
24 vd_out.template get<2>(p)[1][1] = a;
// Mirror the loaded value back into the second component of the input property.
25 vd_in.template get<0>(p)[1] = a;
// Device kernel body — aggregate-layout (TLS) READ benchmark, the mirror of the
// write kernel above: each thread reads vd_out's scalar property<0>, both
// components of vector property<1>, all four entries of tensor property<2>, and
// component [0] of vd_in's property<0> (8 float loads), then stores their sum
// into component [1] of vd_in's property<0> (1 float store).
// NOTE(review): the "__global__ void translate_fill_prop_read(vector_type
// vd_out, vector_type2 vd_in)" signature line and braces are missing from this
// extracted chunk; leading digits are extraction artifacts.
28template<
typename vector_type,
typename vector_type2>
// Flat 1-D global thread index; bounds guard not visible here (TODO confirm).
31 auto p = blockIdx.x * blockDim.x + threadIdx.x;
33 float a = vd_out.template get<0>(p);
35 float b = vd_out.template get<1>(p)[0];
36 float c = vd_out.template get<1>(p)[1];
38 float d = vd_out.template get<2>(p)[0][0];
39 float e = vd_out.template get<2>(p)[0][1];
40 float f = vd_out.template get<2>(p)[1][0];
41 float g = vd_out.template get<2>(p)[1][1];
43 float h = vd_in.template get<0>(p)[0];
// Summing forces every load to be live; check_read() verifies this sum on host.
44 vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
// Host-side verifier for the WRITE kernels: copies both containers back to the
// host and checks, element by element, that every destination component equals
// the source value in.get<0>(i)[0] that the kernel fanned out.
// NOTE(review): the declaration of `success` (presumably "bool success = true;"),
// the loop braces, and the "if (success == false)" guard around the failure
// printout are missing from this extracted chunk; leading digits are extraction
// artifacts — restore from the original file.
48template<
typename in_type,
typename out_type>
49void check_write(in_type & in, out_type & out)
// Bring all three output properties and the input property to host memory.
51 out.template deviceToHost<0,1,2>();
52 in.template deviceToHost<0>();
55 for (
int i = 0 ; i < NELEMENTS; i++)
// Reference value: what the write kernel loaded for element i.
57 float a = in.template get<0>(i)[0];
// Exact float equality is intentional here — values are copied, not computed.
59 success &= out.template get<0>(i) == a;
61 success &= out.template get<1>(i)[0] == a;
62 success &= out.template get<1>(i)[1] == a;
64 success &= out.template get<2>(i)[0][0] == a;
65 success &= out.template get<2>(i)[0][1] == a;
66 success &= out.template get<2>(i)[1][0] == a;
67 success &= out.template get<2>(i)[1][1] == a;
// The kernel also mirrored the value into the input's second component.
69 success &= in.template get<0>(i)[1] == a;
// Reported only on failure (the enclosing condition is missing in this chunk).
74 std::cout <<
"FAIL WRITE" << std::endl;
// Host-side verifier for the READ kernels: recomputes the 8-way sum on the host
// for every element and compares it to the value the kernel stored in
// in.get<0>(i)[1]. Prints the offending index and both values on mismatch.
// NOTE(review): the declaration of `success` (presumably "bool success = true;")
// and the loop/if braces are missing from this extracted chunk; leading digits
// are extraction artifacts — restore from the original file.
79template<
typename in_type,
typename out_type>
80void check_read(in_type & in, out_type & out)
// Bring all device-resident data back to the host before comparing.
82 out.template deviceToHost<0,1,2>();
83 in.template deviceToHost<0>();
86 for (
int i = 0 ; i < NELEMENTS ; i++)
// Rebuild the same operands the device kernel read, in the same order.
88 float a = out.template get<0>(i);
90 float b = out.template get<1>(i)[0];
91 float c = out.template get<1>(i)[1];
93 float d = out.template get<2>(i)[0][0];
94 float e = out.template get<2>(i)[0][1];
95 float f = out.template get<2>(i)[1][0];
96 float g = out.template get<2>(i)[1][1];
98 float h = in.template get<0>(i)[0];
// Exact float equality: host and device perform the same left-to-right sum —
// NOTE(review): this assumes no FMA/reassociation differences; confirm flags.
100 success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);
102 if (success ==
false)
104 std::cout <<
"FAIL READ " << i <<
" " << in.template get<0>(i)[1] <<
" != " << a+b+c+d+e+f+g+h << std::endl;
// Device kernel body — seeds both containers with deterministic, index-derived
// values so the benchmark kernels operate on known data: property<0> of vd_in
// gets (i, i+100) and vd_out's scalar/vector/tensor properties get staggered
// offsets of i (+0, +100, +200, +300).
// NOTE(review): the "__global__" signature line and braces are missing from this
// extracted chunk (parameter order presumably (vd_out, vd_in), matching the
// CUDA_LAUNCH call in initialize_buf — TODO confirm); leading digits are
// extraction artifacts. Integer i is implicitly converted to float here, which
// loses precision for i > 2^24 — NELEMENTS is exactly 2^24, so in-range.
110template<
typename vector_type,
typename vector_type2>
// Flat 1-D global thread index; bounds guard not visible here (TODO confirm).
113 auto i = blockIdx.x * blockDim.x + threadIdx.x;
115 vd_in.template get<0>(i)[0] = i;
116 vd_in.template get<0>(i)[1] = i+100.0;
118 vd_out.template get<0>(i) = i+200.0;
120 vd_out.template get<1>(i)[0] = i;
121 vd_out.template get<1>(i)[1] = i+100.0;
123 vd_out.template get<2>(i)[0][0] = i;
124 vd_out.template get<2>(i)[0][1] = i+100.0;
125 vd_out.template get<2>(i)[1][0] = i+200.0;
126 vd_out.template get<2>(i)[1][1] = i+300.0;
// Host wrapper: launches initialize_buff over every element of `out` to reset
// both containers to their deterministic seed values between benchmark phases.
// NOTE(review): parameters are taken by value — presumably cheap handle copies
// of the OpenFPM containers, but confirm this does not deep-copy; braces are
// missing from this extracted chunk and leading digits are artifacts.
129template<
typename vin_type,
typename vout_type>
130void initialize_buf(vin_type in, vout_type out)
// 1-D launch configuration with 256 threads per block (wanted-threads argument
// of getGPUIterator — TODO confirm semantics).
132 auto ite = out.getGPUIterator(256);
// Argument order (out first, then in) matches the other kernel launches below.
133 CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
// Benchmark driver: measures effective device memory bandwidth for the same
// 1-load/9-store (write) and 8-load/1-store (read) access pattern expressed
// four ways — templated __global__ kernels, __device__ lambdas, raw strided
// pointer arithmetic, and (under CUDIFY_USE_CUDA) a plain cudaMemcpy baseline;
// a CUDIFY_USE_OPENMP branch additionally times a single-core host loop.
// Every phase runs 110 iterations, discards the first 10 as warm-up
// (res.get(i-10) is only filled past them), computes GB/s as
// nele * 4 bytes * 9 accesses / wall-clock * 1e-9, and reduces the 100 samples
// with standard_deviation().
// NOTE(review): this chunk is an incomplete extraction — the declarations of
// the containers `in`/`out` and the sample vector `res`, the timer `t` with its
// t.start()/t.stop() calls, the "if (i >= 10)" guards, check_write/check_read
// calls, braces, and a few statements inside the lambdas (e.g. the out_s store
// in the array-write phase, "float a = ..." in the array-read phases) are
// missing; the digits fused at line starts are extraction artifacts, not code.
// Restore all of these from the original file before compiling.
136int main(
int argc,
char *argv[])
143 int nele = NELEMENTS;
// Seed device buffers with known values before the first timed phase.
148 initialize_buf(in,out);
// Single 1-D launch configuration (256 threads/block) reused by all phases.
152 auto ite = out.getGPUIterator(256);
// ---- Phase 1: WRITE via templated kernel (aggregate/TLS layout) ----
157 for (
int i = 0 ; i < 110 ; i++)
// Synchronize before timing so earlier async work is not billed to this phase.
159 cudaDeviceSynchronize();
164 CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
// Kernel launches are asynchronous; synchronize so the timer sees completion.
166 cudaDeviceSynchronize();
// Record GB/s for the 100 post-warm-up iterations.
171 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
173 std::cout <<
"Time: " << t.
getwct() << std::endl;
174 std::cout <<
"BW: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
// Mean/stddev of the write-bandwidth samples (TLS variant).
177 double mean_write_tls = 0.0;
178 double dev_write_tls = 0.0;
179 standard_deviation(res,mean_write_tls,dev_write_tls);
// ---- Phase 2: READ via templated kernel ----
183 initialize_buf(in,out);
185 for (
int i = 0 ; i < 110 ; i++)
187 cudaDeviceSynchronize();
192 CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
194 cudaDeviceSynchronize();
199 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
201 std::cout <<
"Time: " << t.
getwct() << std::endl;
202 std::cout <<
"BW: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
205 double mean_read_tls = 0.0;
206 double dev_read_tls = 0.0;
207 standard_deviation(res,mean_read_tls,dev_read_tls);
// ---- Phase 3: WRITE via __device__ lambda (same accesses as Phase 1) ----
215 initialize_buf(in,out);
217 for (
int i = 0 ; i < 110 ; i++)
219 cudaDeviceSynchronize();
// Kernel-side views captured by value into the lambda.
223 auto vd_out = out.toKernel();
224 auto vd_in = in.toKernel();
// CUDA_LAUNCH_LAMBDA supplies blockIdx/threadIdx as explicit dim3 parameters.
226 auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
228 auto p = blockIdx.x * blockDim.x + threadIdx.x;
230 float a = vd_in.template get<0>(p)[0];
232 vd_out.template get<0>(p) = a;
234 vd_out.template get<1>(p)[0] = a;
235 vd_out.template get<1>(p)[1] = a;
237 vd_out.template get<2>(p)[0][0] = a;
238 vd_out.template get<2>(p)[0][1] = a;
239 vd_out.template get<2>(p)[1][0] = a;
240 vd_out.template get<2>(p)[1][1] = a;
241 vd_in.template get<0>(p)[1] = a;
244 CUDA_LAUNCH_LAMBDA(ite, lamb);
246 cudaDeviceSynchronize();
251 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
253 std::cout <<
"Time: " << t.
getwct() << std::endl;
254 std::cout <<
"BW: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
257 double mean_write_lamb = 0.0;
258 double dev_write_lamb = 0.0;
259 standard_deviation(res,mean_write_lamb,dev_write_lamb);
// ---- Phase 4: READ via __device__ lambda (same accesses as Phase 2) ----
261 initialize_buf(in,out);
263 for (
int i = 0 ; i < 110 ; i++)
265 cudaDeviceSynchronize();
270 auto vd_out = out.toKernel();
271 auto vd_in = in.toKernel();
273 auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
275 auto p = blockIdx.x * blockDim.x + threadIdx.x;
277 float a = vd_out.template get<0>(p);
279 float b = vd_out.template get<1>(p)[0];
280 float c = vd_out.template get<1>(p)[1];
282 float d = vd_out.template get<2>(p)[0][0];
283 float e = vd_out.template get<2>(p)[0][1];
284 float f = vd_out.template get<2>(p)[1][0];
285 float g = vd_out.template get<2>(p)[1][1];
287 float h = vd_in.template get<0>(p)[0];
288 vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
291 CUDA_LAUNCH_LAMBDA(ite, lamb);
293 cudaDeviceSynchronize();
298 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
300 std::cout <<
"Time: " << t.
getwct() << std::endl;
301 std::cout <<
"BW: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
304 double mean_read_lamb = 0.0;
305 double dev_read_lamb = 0.0;
306 standard_deviation(res,mean_read_lamb,dev_read_lamb);
// ---- Phase 5: WRITE via raw device pointers + explicit SoA strides ----
// Bypasses the container accessors entirely to expose any abstraction overhead.
310 for (
int i = 0 ; i < 110 ; i++)
312 cudaDeviceSynchronize();
// Raw device base pointers for each property of the aggregate.
316 float * out_s = (
float *)out.getDeviceBuffer<0>();
317 float * out_v = (
float *)out.getDeviceBuffer<1>();
318 float * out_m = (
float *)out.getDeviceBuffer<2>();
319 float * in_v = (
float *)in.getDeviceBuffer<0>();
// capacity() is the element pitch between components in the SoA layout
// (>= nele; capacity, not size — NOTE(review): confirm layout assumption).
321 int stride = out.capacity();
323 auto lamb_arr_write = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
325 auto p = blockIdx.x * blockDim.x + threadIdx.x;
327 float a = in_v[p + 0*stride];
// NOTE(review): the scalar store "out_s[p] = a;" (orig. line ~329) appears to
// be missing from this extracted chunk.
331 out_v[p + 0*stride] = a;
332 out_v[p + 1*stride] = a;
// Tensor [r][c] maps to base + (r*2 + c)*stride in this flattened layout.
334 out_m[p + 0*2*stride + 0*stride ] = a;
335 out_m[p + 0*2*stride + 1*stride ] = a;
336 out_m[p + 1*2*stride + 0*stride ] = a;
337 out_m[p + 1*2*stride + 1*stride ] = a;
338 in_v[p + 1*stride] = a;
341 CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);
343 cudaDeviceSynchronize();
348 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
350 std::cout <<
"Time ARR: " << t.
getwct() << std::endl;
351 std::cout <<
"BW ARR: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
354 double mean_write_arr = 0.0;
355 double dev_write_arr = 0.0;
356 standard_deviation(res,mean_write_arr,dev_write_arr);
// ---- Phase 6: READ via raw device pointers + explicit SoA strides ----
360 for (
int i = 0 ; i < 110 ; i++)
362 cudaDeviceSynchronize();
366 float * out_s = (
float *)out.getDeviceBuffer<0>();
367 float * out_v = (
float *)out.getDeviceBuffer<1>();
368 float * out_m = (
float *)out.getDeviceBuffer<2>();
369 float * in_v = (
float *)in.getDeviceBuffer<0>();
371 int stride = out.capacity();
373 auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
375 auto p = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): "float a = out_s[p];" (orig. line ~377) appears to be missing
// from this extracted chunk — `a` is used in the sum below.
379 float b = out_v[p + 0*stride];
380 float c = out_v[p + 1*stride];
382 float d = out_m[p + 0*2*stride + 0*stride];
383 float e = out_m[p + 0*2*stride + 1*stride];
384 float f = out_m[p + 1*2*stride + 0*stride];
385 float g = out_m[p + 1*2*stride + 1*stride];
387 float h = in_v[p + 0*stride];
388 in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
391 CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);
393 cudaDeviceSynchronize();
398 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
400 std::cout <<
"Time ARR: " << t.
getwct() << std::endl;
401 std::cout <<
"BW ARR: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
404 double mean_read_arr = 0.0;
405 double dev_read_arr = 0.0;
406 standard_deviation(res,mean_read_arr,dev_read_arr);
// ---- Phase 7 (OpenMP backend only): same READ pattern on a single host core ----
412 #ifdef CUDIFY_USE_OPENMP
414 for (
int i = 0 ; i < 110 ; i++)
// Under the OpenMP backend these "device" buffers are host-resident.
419 float * out_s = (
float *)out.getDeviceBuffer<0>();
420 float * out_v = (
float *)out.getDeviceBuffer<1>();
421 float * out_m = (
float *)out.getDeviceBuffer<2>();
422 float * in_v = (
float *)in.getDeviceBuffer<0>();
424 int stride = out.capacity();
// __host__ lambda invoked directly per element by the serial loop below.
426 auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __host__ (
int p)
// NOTE(review): "float a = out_s[p];" appears to be missing here as well.
430 float b = out_v[p + 0*stride];
431 float c = out_v[p + 1*stride];
433 float d = out_m[p + 0*2*stride + 0*stride];
434 float e = out_m[p + 0*2*stride + 1*stride];
435 float f = out_m[p + 1*2*stride + 0*stride];
436 float g = out_m[p + 1*2*stride + 1*stride];
438 float h = in_v[p + 0*stride];
439 in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
// Serial host sweep over all elements (the call into lamb_arr_red is on a
// line missing from this chunk).
442 for (
int i = 0 ; i < NELEMENTS ; i++)
450 {res.get(i-10) = (double)nele*4*9 / t.
getwct() * 1e-9;}
452 std::cout <<
"Time ARR: " << t.
getwct() << std::endl;
453 std::cout <<
"BW 1-CORE ARR: " << (double)nele*4*9 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
// ---- Phase 8 (CUDA backend only): cudaMemcpy device-to-device baseline ----
// Copies 2*NELEMENTS floats (out<1> -> in<0>) to measure the hardware's
// achievable copy bandwidth; note the sample formula uses 4*4 bytes/element
// here (read + write of 2 floats), not 4*9.
460 #ifdef CUDIFY_USE_CUDA
462 for (
int i = 0 ; i < 110 ; i++)
464 cudaDeviceSynchronize();
468 float * a = (
float *)in.getDeviceBuffer<0>();
469 float * b = (
float *)out.getDeviceBuffer<1>();
// 2*NELEMENTS*4 bytes = both float components of property<0>.
// NOTE(review): return codes of cudaMemcpy/cudaDeviceSynchronize are unchecked
// throughout this file — acceptable for a benchmark, but errors would be silent.
471 cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);
473 cudaDeviceSynchronize();
478 {res.get(i-10) = (double)nele*4*4 / t.
getwct() * 1e-9;}
480 std::cout <<
"Time: " << t.
getwct() << std::endl;
481 std::cout <<
"BW: " << (double)nele*4*4 / t.
getwct() * 1e-9 <<
" GB/s" << std::endl;
484 double mean_read_mes = 0.0;
485 double dev_read_mes = 0.0;
486 standard_deviation(res,mean_read_mes,dev_read_mes);
488 std::cout <<
"Average measured: " << mean_read_mes <<
" deviation: " << dev_read_mes << std::endl;
// ---- Summary (NOTE(review): the TLS-variant printouts that presumably used
// mean_write_tls/mean_read_tls appear to be on lines missing from this chunk).
495 std::cout <<
"Average READ with lamb: " << mean_read_lamb <<
" deviation: " << dev_read_lamb << std::endl;
496 std::cout <<
"Average WRITE with lamb: " << mean_write_lamb <<
" deviation: " << dev_write_lamb << std::endl;
498 std::cout <<
"Average WRITE with array: " << mean_write_arr <<
" deviation: " << dev_write_arr << std::endl;
499 std::cout <<
"Average READ with array: " << mean_read_arr <<
" deviation: " << dev_read_arr << std::endl;
504int main(
int argc,
char *argv[])
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
void stop()
Stop the timer.
void start()
Start the timer.
double getwct()
Return the elapsed real time.