// OpenFPM_pdata 4.1.0
// Project that contains the implementation of distributed structures
// File: cudify_unit_test.cu
1 #include "config.h"
2 #define BOOST_TEST_DYN_LINK
3 #include <boost/test/unit_test.hpp>
4 
5 #include <type_traits>
6 
7 std::is_trivially_copyable<int> b;
8 
9 #include "util/cuda_launch.hpp"
10 #include "memory/CudaMemory.cuh"
11 
12 #ifdef CUDIFY_USE_OPENMP
13 BOOST_AUTO_TEST_SUITE( cudify_tests_openmp )
14 #elif defined(CUDIFY_USE_CUDA)
15 BOOST_AUTO_TEST_SUITE( cudify_tests_cuda )
16 #elif defined(CUDIFY_USE_HIP)
17 BOOST_AUTO_TEST_SUITE( cudify_tests_hip )
18 #else
19 BOOST_AUTO_TEST_SUITE( cudify_tests_sequencial )
20 #endif
21 
22 struct par_struct
23 {
24  float * ptr;
25 };
26 
//! Launch-geometry holder: grid (wthr) and block (thr) dimensions for CUDA_LAUNCH.
struct ite_g
{
 dim3 wthr; // number of blocks per grid dimension
 dim3 thr;  // number of threads per block dimension

 //! Total number of blocks in the grid.
 // Widen to size_t BEFORE multiplying: dim3 components are 32-bit unsigned,
 // so the product could overflow for large grids if evaluated in 32 bits.
 size_t nblocks() const
 {
  return (size_t)wthr.x * wthr.y * wthr.z;
 }

 //! Total number of threads in one block.
 size_t nthrs() const
 {
  return (size_t)thr.x * thr.y * thr.z;
 }
};
42 
/*! \brief Write constants through a raw pointer and through a by-value struct.
 *
 * Expects a launch whose grid exactly covers the data (no tail guard):
 * one thread per element of a 3D array of gridDim*blockDim elements.
 *
 * \param array device pointer, each element set to 5.0f
 * \param p struct carrying a device pointer, each element set to 17.0f
 */
template<typename T>
__global__ void test1(float * array,T p)
{
 // Global 3D thread coordinates.
 size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
 size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
 size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;

 // Linearize once instead of recomputing the same expression per store.
 size_t lin = idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x;

 // Float literals: avoid silent double->float conversion of 5.0 / 17.0.
 array[lin] = 5.0f;

 p.ptr[lin] = 17.0f;
}
54 
/*! \brief Exercise __syncthreads + shared-memory atomics.
 *
 * Two rounds of one atomicAdd per thread on a shared counter, with barriers
 * between init/add/read phases. With a 4x4x4 block every element ends up as:
 * array[i] == nthrs (64), p.ptr[i] == 2*nthrs (128).
 *
 * \param p struct carrying a device pointer
 * \param array device pointer
 */
template<typename T>
__global__ void test1_syncthreads(T p, float * array)
{
 __shared__ int cnt;

 // Initialize from a single thread: the original all-threads write of the
 // same value is formally a data race on shared memory.
 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
 {cnt = 0;}

 __syncthreads();

 // Global 3D thread coordinates (grid exactly covers the data, no guard).
 size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
 size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
 size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;

 // Linearize once; the original recomputed this expression four times.
 size_t lin = idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x;

 array[lin] = 5.0f;

 p.ptr[lin] = 17.0f;

 // Round 1: every thread bumps the counter, barrier, then all read nthrs.
 atomicAdd(&cnt,1);

 __syncthreads();

 array[lin] = cnt;

 __syncthreads();

 // Round 2: counter reaches 2*nthrs.
 atomicAdd(&cnt,1);

 __syncthreads();

 p.ptr[lin] = cnt;
}
86 
87 
// Launch test1 over a 16^3 domain (4^3 blocks of 4^3 threads) and verify that
// both the directly-passed pointer and the struct-carried pointer were written.
BOOST_AUTO_TEST_CASE( cudify_on_test_test )
{
 init_wrappers();

 // Two 16*16*16 float buffers.
 CudaMemory buf_direct;
 buf_direct.allocate(16*16*16*sizeof(float));

 CudaMemory buf_struct;
 buf_struct.allocate(16*16*16*sizeof(float));

 float * dev_array = (float *)buf_direct.getDevicePointer();

 par_struct prm;
 prm.ptr = (float *)buf_struct.getDevicePointer();

 // 4x4x4 blocks of 4x4x4 threads -> exactly one thread per element.
 ite_g grid;
 grid.wthr = dim3(4,4,4);
 grid.thr = dim3(4,4,4);

 CUDA_LAUNCH(test1,grid,dev_array,prm);

 buf_direct.deviceToHost();
 buf_struct.deviceToHost();

 float * host_direct = (float *)buf_direct.getPointer();
 float * host_struct = (float *)buf_struct.getPointer();

 // Every element must carry the constants written by the kernel.
 bool all_ok = true;
 for (int i = 0 ; i < 16*16*16; i++)
 {
  all_ok = all_ok && (host_direct[i] == 5.0);
  all_ok = all_ok && (host_struct[i] == 17.0);
 }

 BOOST_REQUIRE_EQUAL(all_ok,true);
}
125 
// Launch test1_syncthreads and verify the shared-counter results: with 64
// threads per block, round one leaves 64 in the first buffer and round two
// leaves 128 in the second.
BOOST_AUTO_TEST_CASE( cudify_on_test_test2)
{
 init_wrappers();

 // Two 16*16*16 float buffers.
 CudaMemory buf_a;
 buf_a.allocate(16*16*16*sizeof(float));

 CudaMemory buf_b;
 buf_b.allocate(16*16*16*sizeof(float));

 float * dev_array = (float *)buf_a.getDevicePointer();

 par_struct prm;
 prm.ptr = (float *)buf_b.getDevicePointer();

 // 4x4x4 blocks of 4x4x4 threads -> 64 threads per block.
 ite_g grid;
 grid.wthr = dim3(4,4,4);
 grid.thr = dim3(4,4,4);

 CUDA_LAUNCH(test1_syncthreads,grid,prm,dev_array);

 buf_a.deviceToHost();
 buf_b.deviceToHost();

 float * host_a = (float *)buf_a.getPointer();
 float * host_b = (float *)buf_b.getPointer();

 bool all_ok = true;
 for (int i = 0 ; i < 16*16*16; i++)
 {
  //std::cout << i << " " << host_a[i] << " " << host_b[i] << std::endl;

  all_ok = all_ok && (host_a[i] == 64.0);
  all_ok = all_ok && (host_b[i] == 128.0);
 }

 BOOST_REQUIRE_EQUAL(all_ok,true);
}
165 
166 
// Same scenario as cudify_on_test_test2, but the kernel body is a device
// lambda dispatched via CUDA_LAUNCH_LAMBDA. NOTE(review): the lambda takes
// blockIdx/threadIdx as dim3& parameters while gridDim/blockDim are not
// captured — presumably resolved from the cudify wrapper layer; code left
// untouched because this relies on backend-specific semantics.
BOOST_AUTO_TEST_CASE( cudify_on_test_test2_lambda)
{
 init_wrappers();

 // Two 16*16*16 float buffers: one addressed directly, one via par_struct.
 CudaMemory mem;
 mem.allocate(16*16*16*sizeof(float));

 CudaMemory mem2;
 mem2.allocate(16*16*16*sizeof(float));

 float * array_ptr = (float *)mem.getDevicePointer();

 par_struct p;
 p.ptr = (float *)mem2.getDevicePointer();

 ite_g g;

 // Local copy so the lambda captures a plain pointer by value.
 float * array = array_ptr;

 // 4x4x4 blocks of 4x4x4 threads -> 64 threads per block.
 g.wthr = dim3(4,4,4);
 g.thr = dim3(4,4,4);

 auto lambda_f = [array,p] __device__ (dim3 & blockIdx, dim3 & threadIdx){
 // Shared per-block counter (note: __shared__ inside a lambda is a
 // cudify/CUDA extended-lambda construct).
 __shared__ int cnt;

 // Every thread writes the same initial value before the barrier.
 cnt = 0;

 __syncthreads();

 // Global 3D thread coordinates.
 size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
 size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
 size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;

 array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 5.0;

 p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 17.0;

 // Round 1: each of the 64 threads increments, barrier, then all read 64.
 atomicAdd(&cnt,1);

 __syncthreads();

 array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;

 __syncthreads();

 // Round 2: counter reaches 128.
 atomicAdd(&cnt,1);

 __syncthreads();

 p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;
 };

 CUDA_LAUNCH_LAMBDA(g, lambda_f);

 mem.deviceToHost();
 mem2.deviceToHost();

 float * ptr1 = (float *)mem.getPointer();
 float * ptr2 = (float *)mem2.getPointer();

 // Expect 64 (first round) and 128 (second round) in every element.
 bool check = true;
 for (int i = 0 ; i < 16*16*16; i++)
 {
 //std::cout << i << " " << ptr1[i] << " " << ptr2[i] << std::endl;

 check &= ptr1[i] == 64.0;
 check &= ptr2[i] == 128.0;
 }

 BOOST_REQUIRE_EQUAL(check,true);
}
238 
// Sanity-check the float3/float4 vector types provided by the active backend:
// member assignment and make_float4 must behave like the CUDA builtins.
// (Test name "chack" kept as-is: it is the registered test identifier.)
BOOST_AUTO_TEST_CASE( cudify_type_chack)
{
 float3 v3;

 v3.x = 0.0;
 v3.y = 1.0;
 v3.z = 2.0;

 BOOST_REQUIRE_EQUAL(v3.x,0.0);
 BOOST_REQUIRE_EQUAL(v3.y,1.0);
 BOOST_REQUIRE_EQUAL(v3.z,2.0);

 float4 v4 = make_float4(0.0,1.0,2.0,3.0);


 BOOST_REQUIRE_EQUAL(v4.x,0.0);
 BOOST_REQUIRE_EQUAL(v4.y,1.0);
 BOOST_REQUIRE_EQUAL(v4.z,2.0);
 BOOST_REQUIRE_EQUAL(v4.w,3.0);
}
259 
260 BOOST_AUTO_TEST_SUITE_END()
261 
// Doxygen cross-reference residue from extraction (CudaMemory API used above):
//   virtual bool allocate(size_t sz)   — allocate memory                 (CudaMemory.cu:38)
//   virtual void * getPointer()        — get a readable host pointer     (CudaMemory.cu:352)
//   virtual void * getDevicePointer()  — get the device-side pointer     (CudaMemory.cu:497)
//   virtual void deviceToHost()        — move memory from device to host (CudaMemory.cu:367)