OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
map_vector_sparse_cuda_kernels_unit_tests.cu
1 #define BOOST_TEST_DYN_LINK
2 #include <boost/test/unit_test.hpp>
3 #include <Vector/map_vector.hpp>
4 #include <Vector/cuda/map_vector_sparse_cuda_kernels.cuh>
5 #include "util/cuda/scan_ofp.cuh"
6 #include "util/cuda/merge_ofp.cuh"
7 
8 BOOST_AUTO_TEST_SUITE( vector_sparse_cuda_kernels )
9 
// Exercise construct_insert_list_key_only with a full-size slot pool
// (nslot = 512): keys scattered across per-block slots must be compacted
// into a dense output array, each block landing at the offset produced by
// the exclusive scan of the per-block counters.
// NOTE(review): the vector_gpu containers block_insert, block_n,
// block_n_scan, output_list_0 and output_list_1 are declared on source
// lines elided from this view — confirm their types against the full file.
10 BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use )
11 {
17 
18  int nblock = 100;
19  int nslot = 512;
20  block_insert.resize(nblock*nslot);
21  block_n.resize(nblock+1);
22  block_n_scan.resize(nblock+1);
23 
24  // fill each block with a random count (< nslot) of random keys
25  for (int i = 0 ; i < nblock ; i++)
26  {
27  block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 511;
28  for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
29  {
30  block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
31  }
32  }
33 
// trailing counter is zero so the exclusive scan's last element equals the
// total number of inserted keys
34  block_n.template get<0>(block_n.size()-1) = 0;
35  block_insert.template hostToDevice<0>();
36  block_n.template hostToDevice<0>();
37 
38  mgpu::ofp_context_t context;
// exclusive scan of the counters gives each block's output offset
39  openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context);
40 
// copy back only the last scan element (the total) to size the outputs
41  block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
42  size_t n_ele = block_n_scan.template get<0>(block_n.size()-1);
43 
44  output_list_0.resize(n_ele);
45  output_list_1.resize(n_ele);
46 
47  dim3 wthr;
48  dim3 thr;
49 
// launch one CUDA block per pool block, 64 threads each
50  wthr.x = nblock;
51  wthr.y = 1;
52  wthr.z = 1;
53  thr.x = 64;
54  thr.y = 1;
55  thr.z = 1;
56 
57  CUDA_LAUNCH_DIM3(construct_insert_list_key_only,wthr,thr,block_insert.toKernel(),block_n.toKernel(),
58  block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
59  nslot);
60 
61  output_list_0.template deviceToHost<0>();
62  block_n_scan.template deviceToHost<0>();
63 
64  // we check
65 
66  bool check = true;
67 
68  // verify every key landed at its block's scan offset in the compacted output
69  for (int i = 0 ; i < nblock ; i++)
70  {
71  for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
72  {
73  check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
74  }
75  }
76 
77  BOOST_REQUIRE_EQUAL(check,true);
78 }
79 
// Same compaction check as the test above but with a small slot pool
// (nslot = 17) and the *_small_pool kernel variant, launched over a flat
// GPU iterator (one thread per slot) instead of one CUDA block per pool
// block.
// NOTE(review): the vector_gpu containers used below are declared on
// source lines elided from this view.
80 BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use_small_pool )
81 {
87 
88  int nblock = 100;
89  int nslot = 17;
90  block_insert.resize(nblock*nslot);
91  block_n.resize(nblock+1);
92  block_n_scan.resize(nblock+1);
93 
94  // fill each block with a random count (< nslot) of random keys
95  for (int i = 0 ; i < nblock ; i++)
96  {
97  block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 16;
98  for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
99  {
100  block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
101  }
102  }
103 
// trailing zero counter -> last scan element is the total insert count
104  block_n.template get<0>(block_n.size()-1) = 0;
105  block_insert.template hostToDevice<0>();
106  block_n.template hostToDevice<0>();
107 
108  mgpu::ofp_context_t context;
109  openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context);
110 
111  block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
112  size_t n_ele = block_n_scan.template get<0>(block_n.size()-1);
113 
114  output_list_0.resize(n_ele);
115  output_list_1.resize(n_ele);
116 
// flat launch: one GPU thread per slot of block_insert
117  auto ite = block_insert.getGPUIterator();
118 
119  CUDA_LAUNCH(construct_insert_list_key_only_small_pool,ite,block_insert.toKernel(),block_n.toKernel(),
120  block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
121  nslot);
122 
123  output_list_0.template deviceToHost<0>();
124  block_n_scan.template deviceToHost<0>();
125 
126  // we check
127 
128  bool check = true;
129 
130  // verify every key landed at its block's scan offset in the compacted output
131  for (int i = 0 ; i < nblock ; i++)
132  {
133  for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
134  {
135  check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
136  }
137  }
138 
139  BOOST_REQUIRE_EQUAL(check,true);
140 }
141 
// Check openfpm::merge: two sorted key/tag sequences — the "old" keys
// tagged 0..1023 and the "add" keys tagged 1024..1123 — must merge into a
// single sequence that is globally non-decreasing and whose tag (property
// <1>) still maps each element back to its source key.
// NOTE(review): vct_index_old, vct_add_index and vct_index are vector_gpu
// containers declared on source lines elided from this view.
142 BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
143 {
147 
150 
151  vct_index_old.resize(1024);
152 
// non-decreasing keys: random jitter in [0,17) on top of a stride of 17
153  for (size_t i = 0 ; i < vct_index_old.size() ; i++)
154  {
155  vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
156  vct_index_old.template get<1>(i) = i;
157  }
158 
159  vct_add_index.resize(100);
160 
// "add" tags continue after the old ones (offset by vct_index_old.size())
161  for (size_t i = 0 ; i < vct_add_index.size() ; i++)
162  {
163  vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
164  vct_add_index.template get<1>(i) = i+vct_index_old.size();
165  }
166 
167  // Now we merge
168 
169  vct_index.resize(vct_add_index.size() + vct_index_old.size());
170 
171  mgpu::ofp_context_t ctx;
172 
173  // host to device
174  vct_index_old.template hostToDevice<0,1>();
175  vct_add_index.template hostToDevice<0,1>();
176 
// key/value merge of (old keys, old tags) with (add keys, add tags)
177  openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
178  (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
179  (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),mgpu::less_t<int>(),ctx);
180 
181  vct_index.template deviceToHost<0,1>();
182 
183  // Check
184 
185  bool match = true;
186 
// verify ordering and that each tag resolves back to the matching source
// key; tags >= vct_index_old.size() refer into the "add" vector
187  size_t i;
188  for ( i = 0 ; i < vct_index.size() - 1 ; i++)
189  {
190  match &= vct_index.template get<0>(i) <= vct_index.template get<0>(i+1);
191 
192  int a = vct_index.template get<1>(i);
193 
194  if (a >= vct_index_old.size())
195  {
196  a -= vct_index_old.size();
197  match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
198  }
199  else
200  {
201  match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
202  }
203  }
204 
// the loop above stops one short; check the last element here
205  int a = vct_index.template get<1>(i);
206 
207  if (a >= vct_index_old.size())
208  {
209  a -= vct_index_old.size();
210  match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
211  }
212  else
213  {
214  match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
215  }
216 
217  BOOST_REQUIRE_EQUAL(match,true);
218 }
219 
// Check the solve_conflicts kernel: after merging "old" and "add"
// key/data pairs, adjacent equal keys within each 128-element CUDA block
// must be reduced pairwise — property <0> with sadd_ (sum), <1> with
// smin_ (min), <2> with smax_ (max) — while distinct keys are copied
// through. The host loop below replays the same per-block logic.
// NOTE(review): vct_index_old, vct_add_index, vct_index, vct_data_old,
// vct_add_data, vct_data_out and vct_tot_out are declared on source lines
// elided from this view.
220 BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
221 {
224  openfpm::vector_gpu<aggregate<int>> merge_indexes;
226  openfpm::vector_gpu<aggregate<int>> vct_index_out;
227 
230 
233 
234  vct_index_old.resize(1024);
235  vct_data_old.resize(vct_index_old.size());
236 
// non-decreasing keys (jitter in [0,17) on a stride of 17) plus random data
237  for (size_t i = 0 ; i < vct_index_old.size() ; i++)
238  {
239  vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
240  vct_index_old.template get<1>(i) = i;
241 
242  vct_data_old.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
243  vct_data_old.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
244  vct_data_old.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
245  }
246 
247  vct_add_index.resize(100);
248  vct_add_data.resize(100);
249 
// "add" tags continue after the old ones (offset by vct_index_old.size())
250  for (size_t i = 0 ; i < vct_add_index.size() ; i++)
251  {
252  vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
253  vct_add_index.template get<1>(i) = i+vct_index_old.size();
254 
255  vct_add_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
256  vct_add_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
257  vct_add_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
258  }
259 
260  // Now we merge
261 
262  vct_index.resize(vct_add_index.size() + vct_index_old.size());
263  merge_indexes.resize(vct_index.size());
264 
265  mgpu::ofp_context_t ctx;
266 
267  // host to device
268  vct_index_old.template hostToDevice<0,1>();
269  vct_add_index.template hostToDevice<0,1>();
270  vct_data_old.template hostToDevice<0,1,2>();
271  vct_add_data.template hostToDevice<0,1,2>();
272 
// merged keys land in vct_index; merge_indexes records the origin tag of
// each merged element (>= old size means it came from the "add" vector)
273  openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
274  (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
275  (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),mgpu::less_t<int>(),ctx);
276 
277  constexpr int bdim = 128;
278 
279  auto ite = vct_index.getGPUIterator(bdim);
280 
// vct_tot_out holds one entry per launched CUDA block
281  vct_index_out.resize(vct_index.size());
282  vct_data_out.resize(vct_index.size());
283  vct_tot_out.resize(ite.wthr.x);
284 
// reduction functors: sum on <0>, min on <1>, max on <2>
285  CUDA_LAUNCH_DIM3((solve_conflicts<decltype(vct_index.toKernel()),decltype(vct_data_old.toKernel()),decltype(vct_tot_out.toKernel()),bdim,sadd_<0>,smin_<1>,smax_<2>>),ite.wthr,ite.thr,vct_index.toKernel(),vct_data_old.toKernel(),
286  merge_indexes.toKernel(),vct_add_data.toKernel(),
287  vct_index_out.toKernel(),vct_data_out.toKernel(),
288  vct_tot_out.toKernel(),vct_data_old.size());
289 
290  vct_index.template deviceToHost<0>();
291  vct_data_out.template deviceToHost<0,1,2>();
292  vct_tot_out.template deviceToHost<0>();
293  vct_index_out.template deviceToHost<0>();
294  merge_indexes.template deviceToHost<0>();
295 
296  size_t cnt = 0;
297 
298  bool match = true;
299 
// replay the per-block conflict resolution on the host, one 128-element
// chunk (= one CUDA block) at a time; scan_block tracks the compacted
// output position within the chunk
300  while (cnt * bdim < vct_index.size())
301  {
302  size_t scan_block = 0;
303  for (size_t i = 0 ; i < bdim - 1 && (cnt * bdim + i + 1) < vct_index.size() ; i++)
304  {
305  if (vct_index.template get<0>(cnt * bdim + i) == vct_index.template get<0>(cnt * bdim + i + 1))
306  {
307  // duplicate key: the pair must have been reduced into one slot.
// NOTE(review): this check assumes the first element of the duplicated
// pair originates from the "old" vector and the second from the "add"
// vector — verify this matches the merge's tie ordering.
308 
309  match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);
310 
311  int index1 = merge_indexes.template get<0>(cnt * bdim + i);
312  int index2 = merge_indexes.template get<0>(cnt * bdim + i + 1) - vct_index_old.size();
313 
314  match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1) + vct_add_data.template get<0>(index2);
315 
316  match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == smin_<1>::red(vct_data_old.template get<1>(index1),vct_add_data.template get<1>(index2));
317  match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == smax_<2>::red(vct_data_old.template get<2>(index1),vct_add_data.template get<2>(index2));
318 
// skip the second element of the reduced pair
319  i++;
320  }
321  else
322  {
// unique key: copied through from whichever source vector it came from
323  match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);
324 
325  int index1 = merge_indexes.template get<0>(cnt * bdim + i);
326  if (index1 < vct_index_old.size())
327  {
328  match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1);
329  match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_data_old.template get<1>(index1);
330  match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_data_old.template get<2>(index1);
331  }
332  else
333  {
334  index1 -= vct_index_old.size();
335  match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_add_data.template get<0>(index1);
336  match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_add_data.template get<1>(index1);
337  match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_add_data.template get<2>(index1);
338  }
339  }
340  scan_block++;
341  }
342 
343  ++cnt;
344  }
345 
346  BOOST_REQUIRE_EQUAL(match,true);
347 }
348 
// Check the realign kernel: each 128-element chunk of vct_index/vct_data
// holds vct_tot_out<0>[i] valid entries; the kernel must pack those valid
// prefixes contiguously into the output at the offsets produced by the
// exclusive scan stored in vct_tot_out<1>.
// NOTE(review): vct_index, vct_data, vct_data_out and vct_tot_out are
// declared on source lines elided from this view; the meaning of
// vct_tot_out property <2> (set to 1 below) is not visible here — confirm
// against the kernel's definition.
349 BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_realign_use )
350 {
353  openfpm::vector_gpu<aggregate<int>> vct_index_out;
355 
357 
358  vct_index.resize(1024);
359  vct_data.resize(vct_index.size());
360  vct_tot_out.resize(vct_index.size() / 128);
361 
// non-decreasing keys plus random data in [0,128)
362  for (size_t i = 0 ; i < vct_index.size() ; i++)
363  {
364  vct_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
365 
366  vct_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
367  vct_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
368  vct_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
369  }
370 
// property <0>: how many of each chunk's 128 entries are valid
371  for (size_t i = 0 ; i < vct_tot_out.size() ; i++)
372  {
373  vct_tot_out.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
374  vct_tot_out.template get<2>(i) = 1;
375  }
376 
377  vct_index.template hostToDevice<0>();
378  vct_data.template hostToDevice<0,1,2>();
379  vct_tot_out.template hostToDevice<0,2>();
380 
381  mgpu::ofp_context_t ctx;
// exclusive scan of the counts (<0>) into the offsets (<1>)
382  openfpm::scan((int *)vct_tot_out.getDeviceBuffer<0>(),vct_tot_out.size(),(int *)vct_tot_out.getDeviceBuffer<1>(),ctx);
383 
384  vct_tot_out.deviceToHost<0,1>();
385  vct_index_out.resize(vct_index.size());
386  vct_data_out.resize(vct_index.size());
387 
// one 128-thread CUDA block per chunk
388  int nblock = vct_tot_out.size();
389  CUDA_LAUNCH_DIM3(realign,nblock,128,vct_index.toKernel(),vct_data.toKernel(),vct_index_out.toKernel(),vct_data_out.toKernel(),vct_tot_out.toKernel());
390 
391  vct_index_out.deviceToHost<0>();
392  vct_data_out.deviceToHost<0,1,2>();
393 
// total packed size = last chunk's offset + last chunk's count
394  int base = vct_tot_out.template get<1>(vct_tot_out.size()-1);
395  int last = vct_tot_out.template get<0>(vct_tot_out.size()-1);
396 
397  vct_index_out.resize(base+last);
398 
// walk every chunk and verify the packed output matches each chunk's
// valid prefix, element by element (pr is the running output position)
399  size_t pr = 0;
400  bool match = true;
401  for (size_t i = 0 ; i < vct_tot_out.size() ; i++)
402  {
403  int tot = vct_tot_out.template get<0>(i);
404  for (size_t j = 0 ; j < tot ; j++)
405  {
406  match &= vct_index_out.template get<0>(pr) == vct_index.template get<0>(i*128 + j);
407  match &= vct_data_out.template get<0>(pr) == vct_data.template get<0>(i*128 + j);
408  match &= vct_data_out.template get<1>(pr) == vct_data.template get<1>(i*128 + j);
409  match &= vct_data_out.template get<2>(pr) == vct_data.template get<2>(i*128 + j);
410 
// dump the first mismatch for debugging (kept printing thereafter since
// match stays false once any comparison fails)
411  if (match == false)
412  {
413  std::cout << 0 << " " << pr << " " << i*128 + j << " " << vct_index_out.template get<0>(pr) << " " << vct_index.template get<0>(i*128 + j) << std::endl;
414  std::cout << 1 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<0>(pr) << " " << vct_data.template get<0>(i*128 + j) << std::endl;
415  std::cout << 2 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<1>(pr) << " " << vct_data.template get<1>(i*128 + j) << std::endl;
416  std::cout << 3 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<2>(pr) << " " << vct_data.template get<2>(i*128 + j) << std::endl;
417  }
418 
419  ++pr;
420  }
421  }
422 
// spot-check: the final packed element equals the last chunk's last valid
// source element
423  BOOST_REQUIRE_EQUAL(vct_index_out.template get<0>(vct_index_out.size()-1), vct_index.template get<0>(last + vct_index.size() - 128 -1 ) );
424  BOOST_REQUIRE_EQUAL(match,true);
425 }
426 
427 BOOST_AUTO_TEST_SUITE_END()
Cross-references (documentation-extractor tooltips):
- size_t size() — stub size; defined in map_vector.hpp:211
- Implementation of a 1-D std::vector-like structure; defined in map_vector.hpp:202