OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
 
map_vector_sparse_cuda_kernels_unit_tests.cu
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <Vector/map_vector.hpp>
#include <Vector/cuda/map_vector_sparse_cuda_kernels.cuh>
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/merge_ofp.cuh"

BOOST_AUTO_TEST_SUITE( vector_sparse_cuda_kernels )

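// This test exercises the block-insert machinery of the sparse-vector kernels: every
// block writes its pending inserts into a fixed-size slot buffer, the per-block
// counters are scanned, and a kernel then compacts the used slots into one
// contiguous list of keys.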
BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use )
{
    // NOTE: the vector declarations are elided in this listing; GPU vectors holding
    // a single int property are assumed here
    openfpm::vector_gpu<aggregate<int>> block_insert;
    openfpm::vector_gpu<aggregate<int>> block_n;
    openfpm::vector_gpu<aggregate<int>> block_n_scan;
    openfpm::vector_gpu<aggregate<int>> output_list_0;
    openfpm::vector_gpu<aggregate<int>> output_list_1;

    int nblock = 100;
    int nslot = 512;
    block_insert.resize(nblock*nslot);
    block_n.resize(nblock+1);
    block_n_scan.resize(nblock+1);

    // fill block_insert with some random data
    for (int i = 0 ; i < nblock ; i++)
    {
        block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 511;
        for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
        {
            block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
        }
    }

    block_n.template get<0>(block_n.size()-1) = 0;
    block_insert.template hostToDevice<0>();
    block_n.template hostToDevice<0>();

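    // openfpm::scan computes a prefix sum of the per-block counters: block_n_scan
    // gives the offset at which each block starts in the compacted output, and its
    // last entry holds the total number of inserted elements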
    gpu::ofp_context_t context;
    openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>(), context);

    block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
    size_t n_ele = block_n_scan.template get<0>(block_n.size()-1);

    output_list_0.resize(n_ele);
    output_list_1.resize(n_ele);

    dim3 wthr;
    dim3 thr;

    wthr.x = nblock;
    wthr.y = 1;
    wthr.z = 1;
    thr.x = 64;
    thr.y = 1;
    thr.z = 1;

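    // one CUDA block per insert block: construct_insert_list_key_only copies the
    // filled slots of every block to the contiguous positions given by block_n_scan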
    CUDA_LAUNCH_DIM3(construct_insert_list_key_only,wthr,thr,block_insert.toKernel(),block_n.toKernel(),
                     block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
                     nslot);

    output_list_0.template deviceToHost<0>();
    block_n_scan.template deviceToHost<0>();

    // check the result

    bool check = true;

    // verify that every inserted element appears at its scanned position in the output list
    for (int i = 0 ; i < nblock ; i++)
    {
        for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
        {
            check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
}

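// Same consistency check as above, but with a slot pool (nslot = 17) smaller than a
// thread block, exercising construct_insert_list_key_only_small_pool, which assigns
// one thread per slot through getGPUIterator()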
BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use_small_pool )
{
    // NOTE: the vector declarations are elided in this listing; GPU vectors holding
    // a single int property are assumed here
    openfpm::vector_gpu<aggregate<int>> block_insert;
    openfpm::vector_gpu<aggregate<int>> block_n;
    openfpm::vector_gpu<aggregate<int>> block_n_scan;
    openfpm::vector_gpu<aggregate<int>> output_list_0;
    openfpm::vector_gpu<aggregate<int>> output_list_1;

    int nblock = 100;
    int nslot = 17;
    block_insert.resize(nblock*nslot);
    block_n.resize(nblock+1);
    block_n_scan.resize(nblock+1);

    // fill block_insert with some random data
    for (int i = 0 ; i < nblock ; i++)
    {
        block_n.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 16;
        for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
        {
            block_insert.template get<0>(i*nslot + j) = ((float)rand() / (float)RAND_MAX) * 511;
        }
    }

    block_n.template get<0>(block_n.size()-1) = 0;
    block_insert.template hostToDevice<0>();
    block_n.template hostToDevice<0>();

    gpu::ofp_context_t context;
    openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>(), context);

    block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
    size_t n_ele = block_n_scan.template get<0>(block_n.size()-1);

    output_list_0.resize(n_ele);
    output_list_1.resize(n_ele);

    auto ite = block_insert.getGPUIterator();

    CUDA_LAUNCH(construct_insert_list_key_only_small_pool,ite,block_insert.toKernel(),block_n.toKernel(),
                block_n_scan.toKernel(),output_list_0.toKernel(),output_list_1.toKernel(),
                nslot);

    output_list_0.template deviceToHost<0>();
    block_n_scan.template deviceToHost<0>();

    // check the result

    bool check = true;

    // verify that every inserted element appears at its scanned position in the output list
    for (int i = 0 ; i < nblock ; i++)
    {
        for (int j = 0 ; j < block_n.template get<0>(i) ; j++)
        {
            check &= output_list_0.template get<0>(block_n_scan.template get<0>(i) + j) == block_insert.template get<0>(i*nslot + j);
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
}

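// Merge test: an "old" index vector and an "add" index vector, both sorted by
// construction, are merged with openfpm::merge. Property 1 stores each element's
// position in its source vector (offset by the old size for the added elements),
// so the check below can trace every merged entry back to its origin.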
BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
{
    // NOTE: the vector declarations are elided in this listing; GPU vectors with two
    // int properties (key, source position) are assumed here
    openfpm::vector_gpu<aggregate<int,int>> vct_index_old;
    openfpm::vector_gpu<aggregate<int,int>> vct_add_index;
    openfpm::vector_gpu<aggregate<int,int>> vct_index;

    vct_index_old.resize(1024);

    for (size_t i = 0 ; i < vct_index_old.size() ; i++)
    {
        vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
        vct_index_old.template get<1>(i) = i;
    }

    vct_add_index.resize(100);

    for (size_t i = 0 ; i < vct_add_index.size() ; i++)
    {
        vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
        vct_add_index.template get<1>(i) = i+vct_index_old.size();
    }

    // Now we merge

    vct_index.resize(vct_add_index.size() + vct_index_old.size());

    // NOTE: the context declaration is elided in this listing; a gpu::ofp_context_t
    // is assumed, as in the tests above
    gpu::ofp_context_t ctx;

    // host to device
    vct_index_old.template hostToDevice<0,1>();
    vct_add_index.template hostToDevice<0,1>();

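    // merge the two sorted (key, position) sequences by key into a single sorted vector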
    openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
                   (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
                   (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),gpu::less_t<int>(),ctx);

    vct_index.template deviceToHost<0,1>();

    // Check

    bool match = true;

    size_t i;
    for ( i = 0 ; i < vct_index.size() - 1 ; i++)
    {
        match &= vct_index.template get<0>(i) <= vct_index.template get<0>(i+1);

        int a = vct_index.template get<1>(i);

        if (a >= vct_index_old.size())
        {
            a -= vct_index_old.size();
            match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
        }
        else
        {
            match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
        }
    }

    int a = vct_index.template get<1>(i);

    if (a >= vct_index_old.size())
    {
        a -= vct_index_old.size();
        match &= vct_add_index.template get<0>(a) == vct_index.template get<0>(i);
    }
    else
    {
        match &= vct_index_old.template get<0>(a) == vct_index.template get<0>(i);
    }

    BOOST_REQUIRE_EQUAL(match,true);
}

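// solve_conflicts test: old and new (index, data) pairs are merged by key, then the
// solve_conflicts kernel walks the merged sequence in chunks of bdim elements and,
// whenever the same key appears twice, combines the two payloads property by property
// (sadd_ = sum, smin_ = minimum, smax_ = maximum), writing the number of surviving
// elements of each chunk into vct_tot_out.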
BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
{
    // NOTE: the vector declarations are elided in this listing; int index vectors and
    // float data vectors (three properties) are assumed here
    openfpm::vector_gpu<aggregate<int,int>> vct_index_old;
    openfpm::vector_gpu<aggregate<int,int>> vct_add_index;
    openfpm::vector_gpu<aggregate<int>> vct_index;
    openfpm::vector_gpu<aggregate<int>> merge_indexes;
    openfpm::vector_gpu<aggregate<int>> vct_index_out;
    openfpm::vector_gpu<aggregate<int>> vct_tot_out;
    openfpm::vector_gpu<aggregate<float,float,float>> vct_data_old;
    openfpm::vector_gpu<aggregate<float,float,float>> vct_add_data;
    openfpm::vector_gpu<aggregate<float,float,float>> vct_data_out;

    vct_index_old.resize(1024);
    vct_data_old.resize(vct_index_old.size());

    for (size_t i = 0 ; i < vct_index_old.size() ; i++)
    {
        vct_index_old.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
        vct_index_old.template get<1>(i) = i;

        vct_data_old.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_data_old.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_data_old.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
    }

    vct_add_index.resize(100);
    vct_add_data.resize(100);

    for (size_t i = 0 ; i < vct_add_index.size() ; i++)
    {
        vct_add_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;
        vct_add_index.template get<1>(i) = i+vct_index_old.size();

        vct_add_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_add_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_add_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
    }

    // Now we merge

    vct_index.resize(vct_add_index.size() + vct_index_old.size());
    merge_indexes.resize(vct_index.size());

    // NOTE: the context declaration is elided in this listing; a gpu::ofp_context_t
    // is assumed, as in the tests above
    gpu::ofp_context_t ctx;

    // host to device
    vct_index_old.template hostToDevice<0,1>();
    vct_add_index.template hostToDevice<0,1>();
    vct_data_old.template hostToDevice<0,1,2>();
    vct_add_data.template hostToDevice<0,1,2>();

    openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
                   (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
                   (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),gpu::less_t<int>(),ctx);

    constexpr int bdim = 128;

    auto ite = vct_index.getGPUIterator(bdim);

    vct_index_out.resize(vct_index.size());
    vct_data_out.resize(vct_index.size());
    vct_tot_out.resize(ite.wthr.x);

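    // launch solve_conflicts: within every chunk of bdim merged entries, duplicated
    // keys are reduced into a single entry (sum on property 0, min on property 1,
    // max on property 2) and the number of surviving entries per chunk is written
    // to vct_tot_out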
    CUDA_LAUNCH_DIM3((solve_conflicts<decltype(vct_index.toKernel()),decltype(vct_data_old.toKernel()),decltype(vct_tot_out.toKernel()),bdim,sadd_<0>,smin_<1>,smax_<2>>),ite.wthr,ite.thr,vct_index.toKernel(),vct_data_old.toKernel(),
                     merge_indexes.toKernel(),vct_add_data.toKernel(),
                     vct_index_out.toKernel(),vct_data_out.toKernel(),
                     vct_tot_out.toKernel(),vct_data_old.size());

    vct_index.template deviceToHost<0>();
    vct_data_out.template deviceToHost<0,1,2>();
    vct_tot_out.template deviceToHost<0>();
    vct_index_out.template deviceToHost<0>();
    merge_indexes.template deviceToHost<0>();

    size_t cnt = 0;

    bool match = true;

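    // replay the reduction on the host, chunk by chunk, and compare with the kernel output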
    while (cnt * bdim < vct_index.size())
    {
        size_t scan_block = 0;
        for (size_t i = 0 ; i < bdim - 1 && (cnt * bdim + i + 1) < vct_index.size() ; i++)
        {
            if (vct_index.template get<0>(cnt * bdim + i) == vct_index.template get<0>(cnt * bdim + i + 1))
            {
                // the two keys match: apply the reduction pattern

                match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);

                int index1 = merge_indexes.template get<0>(cnt * bdim + i);
                int index2 = merge_indexes.template get<0>(cnt * bdim + i + 1) - vct_index_old.size();

                match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1) + vct_add_data.template get<0>(index2);

                match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == smin_<1>::red(vct_data_old.template get<1>(index1),vct_add_data.template get<1>(index2));
                match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == smax_<2>::red(vct_data_old.template get<2>(index1),vct_add_data.template get<2>(index2));

                i++;
            }
            else
            {
                match &= vct_index_out.template get<0>(cnt * bdim + scan_block) == vct_index.template get<0>(cnt * bdim + i);

                int index1 = merge_indexes.template get<0>(cnt * bdim + i);
                if (index1 < vct_index_old.size())
                {
                    match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_data_old.template get<0>(index1);
                    match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_data_old.template get<1>(index1);
                    match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_data_old.template get<2>(index1);
                }
                else
                {
                    index1 -= vct_index_old.size();
                    match &= vct_data_out.template get<0>(cnt * bdim + scan_block) == vct_add_data.template get<0>(index1);
                    match &= vct_data_out.template get<1>(cnt * bdim + scan_block) == vct_add_data.template get<1>(index1);
                    match &= vct_data_out.template get<2>(cnt * bdim + scan_block) == vct_add_data.template get<2>(index1);
                }
            }
            scan_block++;
        }

        ++cnt;
    }

    BOOST_REQUIRE_EQUAL(match,true);
}

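// Realign test: property 0 of vct_tot_out holds, for every 128-element chunk, how many
// of its leading elements are kept. Scanning those counts (into property 1) gives each
// chunk its destination offset, and the realign kernel packs the kept (index, data)
// entries contiguously into the output vectors, chunk after chunk.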
BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_realign_use )
{
    // NOTE: the vector declarations are elided in this listing; an int index vector,
    // float data vectors (three properties) and an int counter vector are assumed here
    openfpm::vector_gpu<aggregate<int>> vct_index;
    openfpm::vector_gpu<aggregate<int>> vct_index_out;
    openfpm::vector_gpu<aggregate<float,float,float>> vct_data;
    openfpm::vector_gpu<aggregate<float,float,float>> vct_data_out;
    openfpm::vector_gpu<aggregate<int,int,int>> vct_tot_out;

    vct_index.resize(1024);
    vct_data.resize(vct_index.size());
    vct_tot_out.resize(vct_index.size() / 128);

    for (size_t i = 0 ; i < vct_index.size() ; i++)
    {
        vct_index.template get<0>(i) = 17*(float)rand()/(float)RAND_MAX + i * 17;

        vct_data.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_data.template get<1>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_data.template get<2>(i) = 128*(float)rand()/(float)RAND_MAX;
    }

    for (size_t i = 0 ; i < vct_tot_out.size() ; i++)
    {
        vct_tot_out.template get<0>(i) = 128*(float)rand()/(float)RAND_MAX;
        vct_tot_out.template get<2>(i) = 1;
    }

    vct_index.template hostToDevice<0>();
    vct_data.template hostToDevice<0,1,2>();
    vct_tot_out.template hostToDevice<0,2>();
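
    // scan the per-chunk counters: property 1 of vct_tot_out receives the offset at
    // which each chunk starts writing in the compacted output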
    // NOTE: the context declaration is elided in this listing; a gpu::ofp_context_t
    // is assumed, as in the tests above
    gpu::ofp_context_t ctx;
    openfpm::scan((int *)vct_tot_out.getDeviceBuffer<0>(),vct_tot_out.size(),(int *)vct_tot_out.getDeviceBuffer<1>(),ctx);

    vct_tot_out.deviceToHost<0,1>();
    vct_index_out.resize(vct_index.size());
    vct_data_out.resize(vct_index.size());

    int nblock = vct_tot_out.size();
    CUDA_LAUNCH_DIM3(realign,nblock,128,vct_index.toKernel(),vct_data.toKernel(),vct_index_out.toKernel(),vct_data_out.toKernel(),vct_tot_out.toKernel());

    vct_index_out.deviceToHost<0>();
    vct_data_out.deviceToHost<0,1,2>();

    int base = vct_tot_out.template get<1>(vct_tot_out.size()-1);
    int last = vct_tot_out.template get<0>(vct_tot_out.size()-1);

    vct_index_out.resize(base+last);

    size_t pr = 0;
    bool match = true;
    for (size_t i = 0 ; i < vct_tot_out.size() ; i++)
    {
        int tot = vct_tot_out.template get<0>(i);
        for (size_t j = 0 ; j < tot ; j++)
        {
            match &= vct_index_out.template get<0>(pr) == vct_index.template get<0>(i*128 + j);
            match &= vct_data_out.template get<0>(pr) == vct_data.template get<0>(i*128 + j);
            match &= vct_data_out.template get<1>(pr) == vct_data.template get<1>(i*128 + j);
            match &= vct_data_out.template get<2>(pr) == vct_data.template get<2>(i*128 + j);

            if (match == false)
            {
                std::cout << 0 << " " << pr << " " << i*128 + j << " " << vct_index_out.template get<0>(pr) << " " << vct_index.template get<0>(i*128 + j) << std::endl;
                std::cout << 1 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<0>(pr) << " " << vct_data.template get<0>(i*128 + j) << std::endl;
                std::cout << 2 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<1>(pr) << " " << vct_data.template get<1>(i*128 + j) << std::endl;
                std::cout << 3 << " " << pr << " " << i*128 + j << " " << vct_data_out.template get<2>(pr) << " " << vct_data.template get<2>(i*128 + j) << std::endl;
            }

            ++pr;
        }
    }

    BOOST_REQUIRE_EQUAL(vct_index_out.template get<0>(vct_index_out.size()-1), vct_index.template get<0>(last + vct_index.size() - 128 - 1) );
    BOOST_REQUIRE_EQUAL(match,true);
}

BOOST_AUTO_TEST_SUITE_END()