OpenFPM 5.2.0
Project that contains the implementation of distributed structures
cudify_unit_test.cu
1 #include "config.h"
2 #define BOOST_TEST_DYN_LINK
3 #include <boost/test/unit_test.hpp>
4 
5 #include "util/cuda_util.hpp"
6 #include "memory/CudaMemory.cuh"
7 
8 #ifdef CUDIFY_USE_OPENMP
9 BOOST_AUTO_TEST_SUITE( cudify_tests_openmp )
10 #elif defined(CUDIFY_USE_CUDA)
11 BOOST_AUTO_TEST_SUITE( cudify_tests_cuda )
12 #elif defined(CUDIFY_USE_HIP)
13 BOOST_AUTO_TEST_SUITE( cudify_tests_hip )
14 #else
15 BOOST_AUTO_TEST_SUITE( cudify_tests_sequential )
16 #endif
17 
18 struct par_struct
19 {
20  float * ptr;
21 };
22 
23 struct ite_g
24 {
25  dim3 wthr;
26  dim3 thr;
27 
28  size_t nblocks() const
29  {
30  return wthr.x * wthr.y * wthr.z;
31  }
32 
33  size_t nthrs() const
34  {
35  return thr.x * thr.y * thr.z;
36  }
37 };
38 
39 template<typename T>
40 __global__ void test1(float * array,T p)
41 {
42  size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
43  size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
44  size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;
45 
46  array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 5.0;
47 
48  p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 17.0;
49 }
50 
51 template<typename T>
52 __global__ void test1_syncthreads(T p, float * array)
53 {
54  __shared__ int cnt;
55 
56  cnt = 0;
57 
58  __syncthreads();
59 
60  size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
61  size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
62  size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;
63 
64  array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 5.0;
65 
66  p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 17.0;
67 
68  atomicAdd(&cnt,1);
69 
70  __syncthreads();
71 
72  array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;
73 
74  __syncthreads();
75 
76  atomicAdd(&cnt,1);
77 
78  __syncthreads();
79 
80  p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;
81 }
82 
83 
84 BOOST_AUTO_TEST_CASE( cudify_on_test_test )
85 {
86  init_wrappers();
87 
88  CudaMemory mem;
89  mem.allocate(16*16*16*sizeof(float));
90 
91  CudaMemory mem2;
92  mem2.allocate(16*16*16*sizeof(float));
93 
94  float * array_ptr = (float *)mem.getDevicePointer();
95 
96  par_struct p;
97  p.ptr = (float *)mem2.getDevicePointer();
98 
99  ite_g g;
100 
101  g.wthr = dim3(4,4,4);
102  g.thr = dim3(4,4,4);
103 
104  CUDA_LAUNCH(test1,g,array_ptr,p);
105 
106  mem.deviceToHost();
107  mem2.deviceToHost();
108 
109  float * ptr1 = (float *)mem.getPointer();
110  float * ptr2 = (float *)mem2.getPointer();
111 
112  bool check = true;
113  for (int i = 0 ; i < 16*16*16; i++)
114  {
115  check &= ptr1[i] == 5.0;
116  check &= ptr2[i] == 17.0;
117  }
118 
119  BOOST_REQUIRE_EQUAL(check,true);
120 }
121 
122 BOOST_AUTO_TEST_CASE( cudify_on_test_test2)
123 {
124  init_wrappers();
125 
126  CudaMemory mem;
127  mem.allocate(16*16*16*sizeof(float));
128 
129  CudaMemory mem2;
130  mem2.allocate(16*16*16*sizeof(float));
131 
132  float * array_ptr = (float *)mem.getDevicePointer();
133 
134  par_struct p;
135  p.ptr = (float *)mem2.getDevicePointer();
136 
137  ite_g g;
138 
139  g.wthr = dim3(4,4,4);
140  g.thr = dim3(4,4,4);
141 
142  CUDA_LAUNCH(test1_syncthreads,g,p,array_ptr);
143 
144  mem.deviceToHost();
145  mem2.deviceToHost();
146 
147  float * ptr1 = (float *)mem.getPointer();
148  float * ptr2 = (float *)mem2.getPointer();
149 
150  bool check = true;
151  for (int i = 0 ; i < 16*16*16; i++)
152  {
153  //std::cout << i << " " << ptr1[i] << " " << ptr2[i] << std::endl;
154 
155  check &= ptr1[i] == 64.0;
156  check &= ptr2[i] == 128.0;
157  }
158 
159  BOOST_REQUIRE_EQUAL(check,true);
160 }
161 
162 
163 BOOST_AUTO_TEST_CASE( cudify_on_test_test2_lambda)
164 {
165  init_wrappers();
166 
167  CudaMemory mem;
168  mem.allocate(16*16*16*sizeof(float));
169 
170  CudaMemory mem2;
171  mem2.allocate(16*16*16*sizeof(float));
172 
173  float * array_ptr = (float *)mem.getDevicePointer();
174 
175  par_struct p;
176  p.ptr = (float *)mem2.getDevicePointer();
177 
178  ite_g g;
179 
180  float * array = array_ptr;
181 
182  g.wthr = dim3(4,4,4);
183  g.thr = dim3(4,4,4);
184 
185  auto lambda_f = [array,p] __device__ (dim3 & blockIdx, dim3 & threadIdx){
186  __shared__ int cnt;
187 
188  cnt = 0;
189 
190  __syncthreads();
191 
192  size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
193  size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
194  size_t idx_z = blockIdx.z * blockDim.z + threadIdx.z;
195 
196  array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 5.0;
197 
198  p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = 17.0;
199 
200  atomicAdd(&cnt,1);
201 
202  __syncthreads();
203 
204  array[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;
205 
206  __syncthreads();
207 
208  atomicAdd(&cnt,1);
209 
210  __syncthreads();
211 
212  p.ptr[idx_z*gridDim.x*gridDim.y*blockDim.x*blockDim.y + idx_y*gridDim.x*blockDim.x + idx_x] = cnt;
213  };
214 
215  CUDA_LAUNCH_LAMBDA(g, lambda_f);
216 
217  mem.deviceToHost();
218  mem2.deviceToHost();
219 
220  float * ptr1 = (float *)mem.getPointer();
221  float * ptr2 = (float *)mem2.getPointer();
222 
223  bool check = true;
224  for (int i = 0 ; i < 16*16*16; i++)
225  {
226  //std::cout << i << " " << ptr1[i] << " " << ptr2[i] << std::endl;
227 
228  check &= ptr1[i] == 64.0;
229  check &= ptr2[i] == 128.0;
230  }
231 
232  BOOST_REQUIRE_EQUAL(check,true);
233 }
234 
235 BOOST_AUTO_TEST_CASE( cudify_type_check )
236 {
237  float3 f3;
238 
239  f3.x = 0.0;
240  f3.y = 1.0;
241  f3.z = 2.0;
242 
243  BOOST_REQUIRE_EQUAL(f3.x,0.0);
244  BOOST_REQUIRE_EQUAL(f3.y,1.0);
245  BOOST_REQUIRE_EQUAL(f3.z,2.0);
246 
247  float4 f4 = make_float4(0.0,1.0,2.0,3.0);
248 
249 
250  BOOST_REQUIRE_EQUAL(f4.x,0.0);
251  BOOST_REQUIRE_EQUAL(f4.y,1.0);
252  BOOST_REQUIRE_EQUAL(f4.z,2.0);
253  BOOST_REQUIRE_EQUAL(f4.w,3.0);
254 }
255 
256 BOOST_AUTO_TEST_SUITE_END()
257 
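Note on the expected values: the __syncthreads tests above launch each block with thr = dim3(4,4,4), i.e. 4*4*4 = 64 threads per block, so after the first round of atomicAdd on the block-shared counter cnt every thread reads back 64, and after the second round it reads 128. The wthr = dim3(4,4,4) grid of such blocks gives 16*16*16 = 4096 threads, one per element of each buffer, which is why the host-side loops check every one of the 4096 entries against those values.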
Cross-references used in this listing:
par_struct: definition at cudify_unit_test.cu:18
ite_g: definition at cudify_unit_test.cu:23
CudaMemory: definition at CudaMemory.cuh:58
CudaMemory::allocate: virtual bool allocate(size_t sz), allocate memory (CudaMemory.cu:38)
CudaMemory::getDevicePointer: virtual void * getDevicePointer(), get a readable pointer with the data (CudaMemory.cu:503)
CudaMemory::getPointer: virtual void * getPointer(), get a readable pointer with the data (CudaMemory.cu:354)
CudaMemory::deviceToHost: virtual void deviceToHost(), move memory from device to host (CudaMemory.cu:369)
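The CudaMemory members listed above are used in every test case in the same round trip: allocate, take the device pointer, launch through the cudify wrapper, copy back with deviceToHost, then read through getPointer. The sketch below condenses that pattern; it reuses the ite_g launch descriptor defined in the listing, while the kernel fill_one, the function cuda_memory_round_trip and the 1D launch configuration are illustrative and not part of the file.

#include "config.h"
#include "util/cuda_util.hpp"
#include "memory/CudaMemory.cuh"

// Illustrative kernel: every thread writes 1.0f into its own element.
__global__ void fill_one(float * out)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = 1.0f;
}

void cuda_memory_round_trip()
{
    init_wrappers();                            // initialize the cudify backend, as in the tests

    CudaMemory mem;
    mem.allocate(256*sizeof(float));            // buffer for 256 floats

    float * d_ptr = (float *)mem.getDevicePointer();

    ite_g g;                                    // launch descriptor from the listing above
    g.wthr = dim3(4,1,1);                       // 4 blocks (illustrative 1D configuration)
    g.thr  = dim3(64,1,1);                      // 64 threads per block, 256 threads total

    CUDA_LAUNCH(fill_one,g,d_ptr);              // backend-agnostic launch, as in CUDA_LAUNCH(test1,g,...)

    mem.deviceToHost();                         // move memory from device to host
    float * h_ptr = (float *)mem.getPointer();  // host-readable pointer; h_ptr[0..255] are now 1.0f
}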