1#define BOOST_TEST_DYN_LINK
2#include <boost/test/unit_test.hpp>
3#include <Vector/map_vector.hpp>
4#include <Vector/cuda/map_vector_sparse_cuda_kernels.cuh>
5#include "util/cuda/scan_ofp.cuh"
6#include "util/cuda/merge_ofp.cuh"
8BOOST_AUTO_TEST_SUITE( vector_sparse_cuda_kernels )
10BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use )
// NOTE(review): this chunk is incomplete — the vector declarations, braces and
// the nblock/nslot constants are outside the visible span; comments below
// describe only what the visible code does.
// Fill each block with a random number (scaled to [0,511]) of random keys,
// packed at the front of its nslot-wide slot range.
20 block_insert.resize(nblock*nslot);
21 block_n.resize(nblock+1);
22 block_n_scan.resize(nblock+1);
25 for (
int i = 0 ; i < nblock ; i++)
27 block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 511;
28 for (
int j = 0 ; j < block_n.template get<0>(i) ; j++)
30 block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
// Sentinel: zero the extra last counter so the scan's final entry holds the
// total number of inserted elements (presumably an exclusive scan — confirm).
34 block_n.template get<0>(block_n.
size()-1) = 0;
35 block_insert.template hostToDevice<0>();
36 block_n.template hostToDevice<0>();
// Scan the per-block counts into per-block output offsets.
39 openfpm::scan((
int *)block_n.template getDeviceBuffer<0>(), block_n.
size(), (
int *)block_n_scan.template getDeviceBuffer<0>() , context);
// Copy back only the last scan entry: the total element count.
41 block_n_scan.template deviceToHost<0>(block_n_scan.
size()-1,block_n_scan.
size()-1);
42 size_t n_ele = block_n_scan.template get<0>(block_n.
size()-1);
44 output_list_0.resize(n_ele);
45 output_list_1.resize(n_ele);
// Launch the kernel under test (wthr/thr and the trailing launch arguments
// come from lines not visible in this chunk).
57 CUDA_LAUNCH_DIM3(construct_insert_list_key_only,wthr,thr,block_insert.toKernel(),block_n.toKernel(),
58 block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
61 output_list_0.template deviceToHost<0>();
62 block_n_scan.template deviceToHost<0>();
// Check: element j of block i must land at offset scan[i]+j in the output
// (check is declared outside the visible span).
69 for (
int i = 0 ; i < nblock ; i++)
71 for (
int j = 0 ; j < block_n.template get<0>(i) ; j++)
73 check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
77 BOOST_REQUIRE_EQUAL(check,
true);
80BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use_small_pool )
// NOTE(review): incomplete chunk — declarations, braces and nblock/nslot are
// outside the visible span. Same scheme as the test above, but with small
// per-block counts (scaled to [0,16]) and the "small pool" kernel variant.
90 block_insert.resize(nblock*nslot);
91 block_n.resize(nblock+1);
92 block_n_scan.resize(nblock+1);
95 for (
int i = 0 ; i < nblock ; i++)
97 block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 16;
98 for (
int j = 0 ; j < block_n.template get<0>(i) ; j++)
100 block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
// Sentinel: last counter zeroed so the scan's final entry is the total
// element count (presumably an exclusive scan — confirm).
104 block_n.template get<0>(block_n.
size()-1) = 0;
105 block_insert.template hostToDevice<0>();
106 block_n.template hostToDevice<0>();
109 openfpm::scan((
int *)block_n.template getDeviceBuffer<0>(), block_n.
size(), (
int *)block_n_scan.template getDeviceBuffer<0>() , context);
111 block_n_scan.template deviceToHost<0>(block_n_scan.
size()-1,block_n_scan.
size()-1);
112 size_t n_ele = block_n_scan.template get<0>(block_n.
size()-1);
114 output_list_0.resize(n_ele);
115 output_list_1.resize(n_ele);
// Launch configuration derived from the insert container itself.
117 auto ite = block_insert.getGPUIterator();
119 CUDA_LAUNCH(construct_insert_list_key_only_small_pool,ite,block_insert.toKernel(),block_n.toKernel(),
120 block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
123 output_list_0.template deviceToHost<0>();
124 block_n_scan.template deviceToHost<0>();
// Same ordering check as the large-pool variant: element j of block i must
// appear at offset scan[i]+j.
131 for (
int i = 0 ; i < nblock ; i++)
133 for (
int j = 0 ; j < block_n.template get<0>(i) ; j++)
135 check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
139 BOOST_REQUIRE_EQUAL(check,
true);
142BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
// NOTE(review): incomplete chunk — vector declarations, braces, the ctx
// context and the i/match variables are outside the visible span.
// Build two sorted-ish key lists: "old" keys with payloads 0..1023 and "add"
// keys with payloads offset by the old size, GPU-merge them, then verify
// ordering and payload-to-source consistency on the host.
151 vct_index_old.resize(1024);
153 for (
size_t i = 0 ; i < vct_index_old.
size() ; i++)
// Increasing keys: random jitter in [0,17) on top of a stride-17 ramp.
155 vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
156 vct_index_old.template get<1>(i) = i;
159 vct_add_index.resize(100);
161 for (
size_t i = 0 ; i < vct_add_index.
size() ; i++)
163 vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
// Payload offset by the old size, so a merged payload identifies its source.
164 vct_add_index.template get<1>(i) = i+vct_index_old.
size();
169 vct_index.resize(vct_add_index.
size() + vct_index_old.
size());
174 vct_index_old.template hostToDevice<0,1>();
175 vct_add_index.template hostToDevice<0,1>();
// Merge the (key,payload) pairs of both inputs into vct_index, ordered by
// gpu::less_t<int> on the keys.
177 openfpm::merge((
int *)vct_index_old.template getDeviceBuffer<0>(),(
int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.
size(),
178 (
int *)vct_add_index.template getDeviceBuffer<0>(),(
int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.
size(),
179 (
int *)vct_index.template getDeviceBuffer<0>(),(
int *)vct_index.template getDeviceBuffer<1>(),
gpu::less_t<int>(),ctx);
181 vct_index.template deviceToHost<0,1>();
// Verify: keys non-decreasing, and each payload maps back to the source
// element that carries the same key (payload >= old size => from "add").
188 for ( i = 0 ; i < vct_index.
size() - 1 ; i++)
190 match &= vct_index.template get<0>(i) <= vct_index.template get<0>(i+1);
192 int a = vct_index.template get<1>(i);
194 if (a >= vct_index_old.
size())
196 a -= vct_index_old.
size();
197 match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
201 match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
// NOTE(review): the second lookup below looks like the same check for a
// neighboring element, but the branch structure is cut by missing lines —
// confirm against the full file.
205 int a = vct_index.template get<1>(i);
207 if (a >= vct_index_old.
size())
209 a -= vct_index_old.
size();
210 match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
214 match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
217 BOOST_REQUIRE_EQUAL(match,
true);
220BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
// NOTE(review): incomplete chunk — declarations, braces, ctx and the
// cnt/match loop state are outside the visible span; comments cover only the
// visible logic.
// Old set: 1024 sorted-ish keys, each with three random data properties.
234 vct_index_old.resize(1024);
235 vct_data_old.resize(vct_index_old.
size());
237 for (
size_t i = 0 ; i < vct_index_old.
size() ; i++)
239 vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
240 vct_index_old.template get<1>(i) = i;
242 vct_data_old.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
243 vct_data_old.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
244 vct_data_old.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
// Add set: 100 keys drawn from an overlapping range (to force duplicate-key
// conflicts), payloads offset by the old size to identify the source.
247 vct_add_index.resize(100);
248 vct_add_data.resize(100);
250 for (
size_t i = 0 ; i < vct_add_index.
size() ; i++)
252 vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
253 vct_add_index.template get<1>(i) = i+vct_index_old.
size();
255 vct_add_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
256 vct_add_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
257 vct_add_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
262 vct_index.resize(vct_add_index.
size() + vct_index_old.
size());
263 merge_indexes.resize(vct_index.
size());
268 vct_index_old.template hostToDevice<0,1>();
269 vct_add_index.template hostToDevice<0,1>();
270 vct_data_old.template hostToDevice<0,1,2>();
271 vct_add_data.template hostToDevice<0,1,2>();
// Merge the two key sets; merge_indexes records, for every merged slot, the
// payload that identifies its origin (old vs add).
273 openfpm::merge((
int *)vct_index_old.template getDeviceBuffer<0>(),(
int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.
size(),
274 (
int *)vct_add_index.template getDeviceBuffer<0>(),(
int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.
size(),
275 (
int *)vct_index.template getDeviceBuffer<0>(),(
int *)merge_indexes.template getDeviceBuffer<0>(),
gpu::less_t<int>(),ctx);
277 constexpr int bdim = 128;
279 auto ite = vct_index.getGPUIterator(bdim);
281 vct_index_out.resize(vct_index.
size());
282 vct_data_out.resize(vct_index.
size());
// One compacted-count slot per launched block.
283 vct_tot_out.resize(ite.wthr.x);
// solve_conflicts: inside each bdim-wide block, equal adjacent keys are
// collapsed, reducing the three data properties with sadd_<0> (sum),
// smin_<1> (min) and smax_<2> (max).
285 CUDA_LAUNCH_DIM3((solve_conflicts<
decltype(vct_index.toKernel()),
decltype(vct_data_old.toKernel()),
decltype(vct_tot_out.toKernel()),bdim,
sadd_<0>,
smin_<1>,
smax_<2>>),ite.wthr,ite.thr,vct_index.toKernel(),vct_data_old.toKernel(),
286 merge_indexes.toKernel(),vct_add_data.toKernel(),
287 vct_index_out.toKernel(),vct_data_out.toKernel(),
288 vct_tot_out.toKernel(),vct_data_old.
size());
290 vct_index.template deviceToHost<0>();
291 vct_data_out.template deviceToHost<0,1,2>();
292 vct_tot_out.template deviceToHost<0>();
293 vct_index_out.template deviceToHost<0>();
294 merge_indexes.template deviceToHost<0>();
// Host-side re-check, block by block (cnt = block counter, scan_block =
// write cursor inside the block's compacted output; cnt/match are declared
// outside the visible span).
300 while (cnt * bdim < vct_index.
size())
302 size_t scan_block = 0;
303 for (
size_t i = 0 ; i < bdim - 1 && (cnt * bdim + i + 1) < vct_index.
size() ; i++)
// Duplicated key: expect the reduced data (old value combined with add
// value via the respective reduction operator).
305 if (vct_index.template get<0>(cnt * bdim + i) == vct_index.template get<0>(cnt * bdim + i + 1))
309 match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);
311 int index1 = merge_indexes.template get<0>(cnt * bdim + i);
312 int index2 = merge_indexes.template get<0>(cnt * bdim + i + 1) - vct_index_old.
size();
314 match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1) + vct_add_data.template get<0>(index2);
316 match &= vct_data_out.template get<1>(cnt * bdim + scan_block) ==
smin_<1>::red(vct_data_old.template get<1>(index1),vct_add_data.template get<1>(index2));
317 match &= vct_data_out.template get<2>(cnt * bdim + scan_block) ==
smax_<2>::red(vct_data_old.template get<2>(index1),vct_add_data.template get<2>(index2));
// Unique key: data must pass through unchanged from whichever source
// (old when merge index < old size, otherwise add) it came from.
323 match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);
325 int index1 = merge_indexes.template get<0>(cnt * bdim + i);
326 if (index1 < vct_index_old.
size())
328 match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1);
329 match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_data_old.template get<1>(index1);
330 match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_data_old.template get<2>(index1);
334 index1 -= vct_index_old.
size();
335 match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_add_data.template get<0>(index1);
336 match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_add_data.template get<1>(index1);
337 match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_add_data.template get<2>(index1);
346 BOOST_REQUIRE_EQUAL(match,
true);
349BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_realign_use )
// NOTE(review): incomplete chunk — declarations, braces, ctx and the
// pr/match variables are outside the visible span.
// Each 128-slot segment holds tot <= 128 valid entries (property <0> of
// vct_tot_out); the realign kernel compacts the valid prefixes into a
// contiguous output.
358 vct_index.resize(1024);
359 vct_data.resize(vct_index.
size());
360 vct_tot_out.resize(vct_index.
size() / 128);
362 for (
size_t i = 0 ; i < vct_index.
size() ; i++)
364 vct_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
366 vct_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
367 vct_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
368 vct_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
// Random per-segment valid count in <0>; <2> set to 1 — presumably a flag
// consumed by the realign path (TODO confirm against the kernel).
371 for (
size_t i = 0 ; i < vct_tot_out.
size() ; i++)
373 vct_tot_out.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
374 vct_tot_out.template get<2>(i) = 1;
377 vct_index.template hostToDevice<0>();
378 vct_data.template hostToDevice<0,1,2>();
379 vct_tot_out.template hostToDevice<0,2>();
// Scan the counts into per-segment output offsets, stored in property <1>
// (presumably an exclusive scan — the last offset is used as a base below).
382 openfpm::scan((
int *)vct_tot_out.getDeviceBuffer<0>(),vct_tot_out.
size(),(
int *)vct_tot_out.getDeviceBuffer<1>(),ctx);
384 vct_tot_out.deviceToHost<0,1>();
385 vct_index_out.resize(vct_index.
size());
386 vct_data_out.resize(vct_index.
size());
// One 128-thread block per segment.
388 int nblock = vct_tot_out.
size();
389 CUDA_LAUNCH_DIM3(realign,nblock,128,vct_index.toKernel(),vct_data.toKernel(),vct_index_out.toKernel(),vct_data_out.toKernel(),vct_tot_out.toKernel());
391 vct_index_out.deviceToHost<0>();
392 vct_data_out.deviceToHost<0,1,2>();
// Final compacted size = last segment's offset + its valid count.
394 int base = vct_tot_out.template get<1>(vct_tot_out.
size()-1);
395 int last = vct_tot_out.template get<0>(vct_tot_out.
size()-1);
397 vct_index_out.resize(base+last);
// Walk the compacted output (pr = read cursor, declared outside this span)
// and compare element-wise against the valid prefix of every segment.
401 for (
size_t i = 0 ; i < vct_tot_out.
size() ; i++)
403 int tot = vct_tot_out.template get<0>(i);
404 for (
size_t j = 0 ; j < tot ; j++)
406 match &= vct_index_out.template get<0>(pr) == vct_index.template get<0>(i*128 + j);
407 match &= vct_data_out.template get<0>(pr) == vct_data.template get<0>(i*128 + j);
408 match &= vct_data_out.template get<1>(pr) == vct_data.template get<1>(i*128 + j);
409 match &= vct_data_out.template get<2>(pr) == vct_data.template get<2>(i*128 + j);
// Diagnostic dump (the guarding condition is outside the visible span):
// prints tag, cursor, source position, and both compared values.
413 std::cout << 0 <<
" " << pr <<
" " << i*128 + j <<
" " << vct_index_out.template get<0>(pr) <<
" " << vct_index.template get<0>(i*128 + j) << std::endl;
414 std::cout << 1 <<
" " << pr <<
" " << i*128 + j <<
" " << vct_data_out.template get<0>(pr) <<
" " << vct_data.template get<0>(i*128 + j) << std::endl;
415 std::cout << 2 <<
" " << pr <<
" " << i*128 + j <<
" " << vct_data_out.template get<1>(pr) <<
" " << vct_data.template get<1>(i*128 + j) << std::endl;
416 std::cout << 3 <<
" " << pr <<
" " << i*128 + j <<
" " << vct_data_out.template get<2>(pr) <<
" " << vct_data.template get<2>(i*128 + j) << std::endl;
// Spot-check: the last compacted element equals its expected source entry
// in the final 128-wide segment.
423 BOOST_REQUIRE_EQUAL(vct_index_out.template get<0>(vct_index_out.
size()-1), vct_index.template get<0>(last + vct_index.
size() - 128 -1 ) );
424 BOOST_REQUIRE_EQUAL(match,
true);
427BOOST_AUTO_TEST_SUITE_END()
Implementation of a 1-D std::vector-like structure.