OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
CellList_gpu_test.cu
1/*
2 * CellList_gpu_test.cpp
3 *
4 * Created on: Jun 13, 2018
5 * Author: i-bird
6 */
7
8#define BOOST_GPU_ENABLED __host__ __device__
9#include "util/cuda_launch.hpp"
10#include "config.h"
11#define BOOST_TEST_DYN_LINK
12#include <boost/test/unit_test.hpp>
13
14#include "util/cuda_util.hpp"
15#include "cuda/CellList_gpu.hpp"
16#include "CellList.hpp"
17#include "util/boost/boost_array_openfpm.hpp"
18#include "Point_test.hpp"
19#include "util/cuda_util.hpp"
20
21BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
22
23template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
24void test_sub_index()
25{
28
29 // fill with some particles
30
32
33 // create 3 particles
34
35 Point<dim,T> p1({0.2,0.2,0.2});
36 Point<dim,T> p2({0.9,0.2,0.2});
37 Point<dim,T> p3({0.2,0.9,0.2});
38 Point<dim,T> p4({0.2,0.2,0.9});
39 Point<dim,T> p5({0.9,0.9,0.2});
40 Point<dim,T> p6({0.9,0.2,0.9});
41 Point<dim,T> p7({0.2,0.9,0.9});
42 Point<dim,T> p8({0.9,0.9,0.9});
43 Point<dim,T> p9({0.0,0.0,0.0});
44 Point<dim,T> p10({0.205,0.205,0.205});
45
46 pl.add(p1);
47 pl.add(p2);
48 pl.add(p3);
49 pl.add(p4);
50 pl.add(p5);
51 pl.add(p6);
52 pl.add(p7);
53 pl.add(p8);
54 pl.add(p9);
55 pl.add(p10);
56
60
61 for (size_t i = 0 ; i < dim ; i++)
62 {
63 div[i] = 17;
64 spacing[i] = 0.1;
65 off[i] = 0;
66 }
67
68 cl_n.resize(17*17*17);
69 cl_n.template fill<0>(0);
70 //CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
71
72 part_ids.resize(pl.size());
73
74 size_t sz[3] = {17,17,17};
75 grid_sm<3,void> gr(sz);
76
77 auto ite = pl.getGPUIterator();
78
79 pl.template hostToDevice<0>();
80
82 Point<dim,T> pt;
83
85
86
87 CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
88 spacing,
89 off,
90 t,
91 pl.size(),
92 0,
93 pl.toKernel(),
94 cl_n.toKernel(),
95 part_ids.toKernel());
96
97 cl_n.template deviceToHost<0>();
98 part_ids.template deviceToHost<0>();
99
100 // I have to find != 0 in the cell with particles
101
102 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
103 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
104 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
105 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
106 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
107 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
108 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
109 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
110 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
111 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));
112
113 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
114 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
115 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
116 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
117 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
118 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
119 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
120 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
121 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
122}
123
124
125template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
126void test_sub_index2()
127{
130
131 // fill with some particles
132
134
135 // Make the test more complicated the test to pass because make different capacity of pl and part_ids
136 pl.resize(256);
137 pl.resize(0);
138
139 // create 3 particles
140
141 Point<dim,T> p1({0.2,0.2,0.2});
142 Point<dim,T> p2({0.9,0.2,0.2});
143 Point<dim,T> p3({0.2,0.9,0.2});
144 Point<dim,T> p4({0.2,0.2,0.9});
145 Point<dim,T> p5({0.9,0.9,0.2});
146 Point<dim,T> p6({0.9,0.2,0.9});
147 Point<dim,T> p7({0.2,0.9,0.9});
148 Point<dim,T> p8({0.9,0.9,0.9});
149 Point<dim,T> p9({0.0,0.0,0.0});
150 Point<dim,T> p10({0.205,0.205,0.205});
151
152 Point<dim,T> pt({-0.3,-0.3,-0.3});
153
154 p1 += pt;
155 p2 += pt;
156 p3 += pt;
157 p4 += pt;
158 p5 += pt;
159 p6 += pt;
160 p7 += pt;
161 p8 += pt;
162 p9 += pt;
163 p10 += pt;
164
165
166 pl.add(p1);
167 pl.add(p2);
168 pl.add(p3);
169 pl.add(p4);
170 pl.add(p5);
171 pl.add(p6);
172 pl.add(p7);
173 pl.add(p8);
174 pl.add(p9);
175 pl.add(p10);
176
180
181 for (size_t i = 0 ; i < dim ; i++)
182 {
183 div[i] = 17;
184 spacing[i] = 0.1;
185 off[i] = 0;
186 }
187
188 cl_n.resize(17*17*17);
189 cl_n.template fill<0>(0);
190// CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
191
192 part_ids.resize(pl.size());
193
194 size_t sz[3] = {17,17,17};
195 grid_sm<3,void> gr(sz);
196
197 auto ite = pl.getGPUIterator();
198
199 pl.template hostToDevice<0>();
200
201 Matrix<dim,T> mt;
202
203 shift_only<dim,T> t(mt,pt);
204
205
206 CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
207 spacing,
208 off,
209 t,
210 pl.size(),
211 0,
212 pl.toKernel(),
213 cl_n.toKernel(),
214 part_ids.toKernel());
215
216 cl_n.template deviceToHost<0>();
217 part_ids.template deviceToHost<0>();
218
219 // I have to find != 0 in the cell with particles
220
221 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
222 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
223 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
224 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
225 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
226 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
227 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
228 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
229 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
230 BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));
231
232 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
233 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
234 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
235 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
236 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
237 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
238 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
239 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
240 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
241}
242
243template<unsigned int dim, typename T>
244void create_n_part(int n_part,
246 CellList<dim,T, Mem_fast<>> & cl)
247{
248 pl.resize(n_part);
249
250 auto it = pl.getIterator();
251
252 while(it.isNext())
253 {
254 auto p = it.get();
255
256 pl.template get<0>(p)[0] = (double)rand()/RAND_MAX;
257 pl.template get<0>(p)[1] = (double)rand()/RAND_MAX;
258 pl.template get<0>(p)[2] = (double)rand()/RAND_MAX;
259
260 Point<dim,T> xp;
261 xp.get(0) = pl.template get<0>(p)[0];
262 xp.get(1) = pl.template get<0>(p)[1];
263 xp.get(2) = pl.template get<0>(p)[2];
264
265 size_t c = cl.getCell(xp);
266 cl.addCell(c,p);
267
268 ++it;
269 }
270}
271
272template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
273void create_starts_and_parts_ids(CellList<dim,T, Mem_fast<>> & cl,
275 size_t n_part,
276 size_t n_cell,
280{
281 // Construct starts and part_ids
282
283 part_ids.resize(n_part);
284 starts.resize(n_cell);
285 cells.resize(n_part);
286
288
289 size_t start = 0;
290
291 while (itg.isNext())
292 {
293 auto cell = itg.get();
294
295 size_t clin = gr.LinId(cell);
296
297 for (size_t j = 0 ; j < cl.getNelements(clin) ; j++)
298 {
299 size_t p_id = cl.get(clin,j);
300
301 part_ids.template get<0>(p_id)[0] = clin;
302
303 part_ids.template get<0>(p_id)[1] = j;
304
305 cells.template get<0>(start+j) = p_id;
306 }
307 starts.template get<0>(clin) = start;
308 start += cl.getNelements(clin);
309
310 ++itg;
311 }
312}
313
314template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
315void test_fill_cell()
316{
317#ifndef MAKE_CELLLIST_DETERMINISTIC
318
323
324
325 // CellList to check the result
326
327 Box<dim,T> domain;
328
329 typename openfpm::array<T,dim> spacing;
332
333 size_t div_host[dim];
334
335 size_t tot = 1;
336 for (size_t i = 0 ; i < dim ; i++)
337 {
338 div_host[i] = 10;
339 div_c[i] = 10;
340 tot *= div_host[i];
341 spacing[i] = 0.1;
342 domain.setLow(i,0.0);
343 domain.setHigh(i,1.0);
344 off[i] = 0;
345 }
346
347 CellList<dim,T, Mem_fast<>> cl(domain,div_host,0);
349
350 create_n_part(5000,pl,cl);
351
352 grid_sm<dim,void> gr(div_host);
353
354 create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);
355
356 bool check = true;
357 cells.resize(pl.size());
358 for (size_t i = 0 ; i < gr.size() - 1 ; i++)
359 {
360 size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);
361
362 check &= (tot_p == cl.getNelements(i));
363
364 grid_key_dx<dim> key = gr.InvLinId(i);
365
366 // Now we check the ids
367
368 for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
369 {
370 size_t p_id = cl.get(i,j);
371
372 check &= part_ids.template get<0>(p_id)[0] == i;
373 }
374 }
375
376 BOOST_REQUIRE(check == true);
377
378 auto itgg = part_ids.getGPUIterator();
379
380 starts.template hostToDevice<0>();
381 part_ids.template hostToDevice<0>();
382
383 // Here we test fill cell
384 CUDA_LAUNCH_DIM3((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg.wthr,itgg.thr,0,
385 div_c,
386 off,
387 part_ids.size(),
388 0,
389 starts.toKernel(),
390 part_ids.toKernel(),
391 cells.toKernel() );
392
393 cells.template deviceToHost<0>();
394
395 for (size_t i = 0 ; i < gr.size() - 1 ; i++)
396 {
397 size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);
398
399 check &= (tot_p == cl.getNelements(i));
400
401 grid_key_dx<dim> key = gr.InvLinId(i);
402
403 // Now we check the ids
404
405 for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
406 {
407 size_t p_id = cl.get(i,j);
408
409 size_t p_id2 = cells.template get<0>(starts.template get<0>(i) + j);
410
411 check &= (p_id == p_id2);
412 }
413 }
414
415 BOOST_REQUIRE(check == true);
416
417#endif
418}
419
420template<typename sparse_vector_type>
421__global__ void construct_cells(sparse_vector_type sv, grid_sm<3,void> gs)
422{
423 sv.init();
424
425 grid_key_dx<3> key1({5,5,5});
426 grid_key_dx<3> key2({5,5,6});
427 grid_key_dx<3> key3({5,6,5});
428 grid_key_dx<3> key4({5,6,6});
429 grid_key_dx<3> key5({6,5,5});
430 grid_key_dx<3> key6({6,5,6});
431 grid_key_dx<3> key7({6,6,5});
432 grid_key_dx<3> key8({6,6,6});
433
434 grid_key_dx<3> key9({7,7,7});
435
436 grid_key_dx<3> key10({9,9,9});
437
438 sv.template insert<0>(gs.LinId(key1)) = gs.LinId(key1);
439 sv.template insert<0>(gs.LinId(key2)) = gs.LinId(key2);
440 sv.template insert<0>(gs.LinId(key3)) = gs.LinId(key3);
441 sv.template insert<0>(gs.LinId(key4)) = gs.LinId(key4);
442 sv.template insert<0>(gs.LinId(key5)) = gs.LinId(key5);
443 sv.template insert<0>(gs.LinId(key6)) = gs.LinId(key6);
444 sv.template insert<0>(gs.LinId(key7)) = gs.LinId(key7);
445 sv.template insert<0>(gs.LinId(key8)) = gs.LinId(key8);
446 sv.template insert<0>(gs.LinId(key9)) = gs.LinId(key9);
447 sv.template insert<0>(gs.LinId(key10)) = gs.LinId(key10);
448
449 sv.flush_block_insert();
450}
451
452void test_cell_count_n()
453{
457
458 vs.template setBackground<0>(-1);
459
460 vs.setGPUInsertBuffer(1,32);
461
462 size_t sz[] = {17,17,17};
463 grid_sm<3,void> gs(sz);
464
465 CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);
466
468
469 vs.flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE);
470
471 cells_nn.resize(11);
472 cells_nn.fill<0>(0);
473
474 grid_key_dx<3> start({0,0,0});
475 grid_key_dx<3> stop({2,2,2});
476 grid_key_dx<3> middle({1,1,1});
477
478 int mid = gs.LinId(middle);
479
480 grid_key_dx_iterator_sub<3> it(gs,start,stop);
481
482 while (it.isNext())
483 {
484 auto p = it.get();
485
486 cells_nn_test.add();
487 cells_nn_test.get<0>(cells_nn_test.size()-1) = (int)gs.LinId(p) - mid;
488
489 ++it;
490 }
491
492 cells_nn_test.template hostToDevice<0>();
493
494 auto itgg = vs.getGPUIterator();
495 CUDA_LAUNCH((count_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
496
497 cells_nn.deviceToHost<0>();
498
499 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
500 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
501 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
502 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
503 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
504 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
505 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
506 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
507 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
508 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
509
510 // now we scan
511 openfpm::scan((unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (unsigned int *)cells_nn.template getDeviceBuffer<0>() , ctx);
512
514 cell_nn_list.resize(7*8 + 9 + 2 + 1);
515
516 CUDA_LAUNCH((fill_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
517
518 cell_nn_list.deviceToHost<0>();
519
520 // 8 NN
521 for (size_t i = 0 ; i < 7 ; i++)
522 {
523 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
524 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
525 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
526 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
527 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
528 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
529 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
530 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
531 }
532
533 // 9 NN
534 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
535 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
536 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
537 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
538 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
539 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
540 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
541 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
542 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);
543
544 // 2 NN
545 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
546 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);
547
548 // 1 NN
549 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
550}
551
552BOOST_AUTO_TEST_CASE( test_count_nn_cells )
553{
554 std::cout << "Test cell count nn" << std::endl;
555
556 test_cell_count_n();
557}
558
559BOOST_AUTO_TEST_CASE( test_subindex_funcs )
560{
561 std::cout << "Test cell list GPU base func" << "\n";
562
563 test_sub_index<3,float,int,unsigned char>();
564 test_sub_index2<3,float,int,unsigned char>();
565
566 std::cout << "End cell list GPU" << "\n";
567
568 // Test the cell list
569}
570
571BOOST_AUTO_TEST_CASE ( test_cell_fill )
572{
573 std::cout << "Test GPU fill cells" << "\n";
574
575 test_fill_cell<3,float,unsigned int, unsigned char>();
576
577 std::cout << "End GPU fill cells" << "\n";
578}
579
580template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
581void test_reorder_parts(size_t n_part)
582{
583 // Create n_part
590
593
594 // CellList to check the result
595
596 Box<dim,T> domain;
597
598 typename boost::array_openfpm<T,dim> spacing;
600
601 size_t div_host[dim];
602
603 size_t tot = 1;
604 for (size_t i = 0 ; i < dim ; i++)
605 {
606 div_host[i] = 10;
607 div_c[i] = tot;
608 tot *= div_host[i];
609 spacing[i] = 0.1;
610 domain.setLow(i,0.0);
611 domain.setHigh(i,1.0);
612 }
613
614 CellList<dim,T, Mem_fast<>> cl(domain,div_host,0);
617
618 create_n_part(n_part,pl,cl);
619 parts_prp.resize(n_part);
620 parts_prp_out.resize(n_part);
621 pl_out.resize(n_part);
622 sort_to_not_sort.resize(n_part);
623 non_sort_to_sort.resize(n_part);
624
625 auto p_it = parts_prp.getIterator();
626 while (p_it.isNext())
627 {
628 auto p = p_it.get();
629
630 parts_prp.template get<0>(p) = 10000 + p;
631 parts_prp.template get<1>(p) = 20000 + p;
632
633 parts_prp.template get<2>(p)[0] = 30000 + p;
634 parts_prp.template get<2>(p)[1] = 40000 + p;
635 parts_prp.template get<2>(p)[2] = 50000 + p;
636
637 parts_prp.template get<3>(p)[0][0] = 60000 + p;
638 parts_prp.template get<3>(p)[0][1] = 70000 + p;
639 parts_prp.template get<3>(p)[0][2] = 80000 + p;
640 parts_prp.template get<3>(p)[1][0] = 90000 + p;
641 parts_prp.template get<3>(p)[1][1] = 100000 + p;
642 parts_prp.template get<3>(p)[1][2] = 110000 + p;
643 parts_prp.template get<3>(p)[2][0] = 120000 + p;
644 parts_prp.template get<3>(p)[2][1] = 130000 + p;
645 parts_prp.template get<3>(p)[0][2] = 140000 + p;
646
647 ++p_it;
648 }
649
650 grid_sm<dim,void> gr(div_host);
651
652 create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);
653
654
655 auto itgg = pl.getGPUIterator();
656
657 cells_out.template hostToDevice<0>();
658
659 auto ite = pl.getGPUIterator();
660
661 parts_prp.template hostToDevice<0,1,2,3>();
662
663 // Here we test fill cell
664 CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
665 decltype(pl.toKernel()),
666 decltype(sort_to_not_sort.toKernel()),
667 decltype(cells_out.toKernel()),
668 cnt_type,
669 shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
670 parts_prp.toKernel(),
671 parts_prp_out.toKernel(),
672 pl.toKernel(),
673 pl_out.toKernel(),
674 sort_to_not_sort.toKernel(),
675 non_sort_to_sort.toKernel(),
676 cells_out.toKernel());
677
678 bool check = true;
679 parts_prp_out.template deviceToHost<0>();
680 sort_to_not_sort.template deviceToHost<0>();
681 non_sort_to_sort.template deviceToHost<0>();
682
683 size_t st = 0;
684 for (size_t i = 0 ; i < tot ; i++)
685 {
686 size_t n = cl.getNelements(i);
687
688 for (size_t j = 0 ; j < n ; j++)
689 {
690 size_t p = cl.get(i,j);
691
692 check &= parts_prp_out.template get<0>(st) == parts_prp.template get<0>(p);
693 check &= sort_to_not_sort.template get<0>(st) == p;
694 check &= non_sort_to_sort.template get<0>(p) == st;
695
696 st++;
697 }
698 }
699
700
701 BOOST_REQUIRE_EQUAL(check,true);
702}
703
704BOOST_AUTO_TEST_CASE ( test_reorder_particles )
705{
706 std::cout << "Test GPU reorder" << "\n";
707
708 test_reorder_parts<3,float,unsigned int, unsigned char>(5000);
709
710 std::cout << "End GPU reorder" << "\n";
711}
712
713template<unsigned int dim, typename T, typename CellS> void Test_cell_gpu(SpaceBox<dim,T> & box)
714{
715 //Space where is living the Cell list
716 //SpaceBox<dim,T> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
717
718 // Subdivisions
719 size_t div[dim] = {16,16,16};
720
721 // Origin
722 Point<dim,T> org({0.0,0.0,0.0});
723
724 // id Cell list
725 CellS cl2(box,div);
726
727 // vector of particles
728
731
734
735
736 // create 3 particles
737
738 Point<dim,double> p1({0.2,0.2,0.2});
739 Point<dim,double> p2({0.9,0.2,0.2});
740 Point<dim,double> p3({0.2,0.9,0.2});
741 Point<dim,double> p4({0.2,0.2,0.9});
742 Point<dim,double> p5({0.9,0.9,0.2});
743 Point<dim,double> p6({0.9,0.2,0.9});
744 Point<dim,double> p7({0.2,0.9,0.9});
745 Point<dim,double> p8({0.9,0.9,0.9});
746 Point<dim,double> p9({0.0,0.0,0.0});
747
748 pl.add(p1);
749 pl.add(p2);
750 pl.add(p3);
751 pl.add(p4);
752 pl.add(p5);
753 pl.add(p6);
754 pl.add(p7);
755 pl.add(p8);
756 pl.add(p9);
757
758 pl_prp.resize(pl.size());
759 pl_prp_out.resize(pl.size());
760 pl_out.resize(pl.size());
761
762 for (size_t i = 0 ; i < pl.size() ; i++)
763 {
764 pl_prp.template get<0>(i) = pl.template get<0>(i)[0];
765
766 pl_prp.template get<1>(i)[0] = pl.template get<0>(i)[0]+100.0;
767 pl_prp.template get<1>(i)[1] = pl.template get<0>(i)[1]+100.0;
768 pl_prp.template get<1>(i)[2] = pl.template get<0>(i)[2]+100.0;
769
770 pl_prp.template get<2>(i)[0][0] = pl.template get<0>(i)[0]+1000.0;
771 pl_prp.template get<2>(i)[0][1] = pl.template get<0>(i)[1]+1000.0;
772 pl_prp.template get<2>(i)[0][2] = pl.template get<0>(i)[2]+1000.0;
773
774 pl_prp.template get<2>(i)[1][0] = pl.template get<0>(i)[0]+2000.0;
775 pl_prp.template get<2>(i)[1][1] = pl.template get<0>(i)[1]+3000.0;
776 pl_prp.template get<2>(i)[1][2] = pl.template get<0>(i)[2]+4000.0;
777
778 pl_prp.template get<2>(i)[2][0] = pl.template get<0>(i)[0]+5000.0;
779 pl_prp.template get<2>(i)[2][1] = pl.template get<0>(i)[1]+6000.0;
780 pl_prp.template get<2>(i)[2][2] = pl.template get<0>(i)[2]+7000.0;
781 }
782
783 pl_prp.resize(pl.size());
784 pl_prp_out.resize(pl.size());
785
786 pl.template hostToDevice<0>();
787 pl_prp.template hostToDevice<0,1,2>();
788
789 // create an gpu context
790 gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
791 cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context);
792
793 // Check
794
795 pl_prp_out.deviceToHost<0>();
796 pl_prp_out.deviceToHost<1>();
797 pl_prp_out.deviceToHost<2>();
798
800
802
803 pl_correct.add(p9);
804 pl_correct.add(p1);
805 pl_correct.add(p2);
806 pl_correct.add(p3);
807 pl_correct.add(p5);
808 pl_correct.add(p4);
809 pl_correct.add(p6);
810 pl_correct.add(p7);
811 pl_correct.add(p8);
812
813 for (size_t i = 0 ; i < pl_correct.size() ; i++)
814 {
815 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<0>(i),(float)pl_correct.template get<0>(i)[0]);
816 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[0],(float)(pl_correct.template get<0>(i)[0]+100.0));
817 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[1],(float)(pl_correct.template get<0>(i)[1]+100.0));
818 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[2],(float)(pl_correct.template get<0>(i)[2]+100.0));
819 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][0],(float)(pl_correct.template get<0>(i)[0] + 1000.0));
820 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][1],(float)(pl_correct.template get<0>(i)[1] + 1000.0));
821 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][2],(float)(pl_correct.template get<0>(i)[2] + 1000.0));
822 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][0],(float)(pl_correct.template get<0>(i)[0] + 2000.0));
823 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][1],(float)(pl_correct.template get<0>(i)[1] + 3000.0));
824 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][2],(float)(pl_correct.template get<0>(i)[2] + 4000.0));
825 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][0],(float)(pl_correct.template get<0>(i)[0] + 5000.0));
826 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][1],(float)(pl_correct.template get<0>(i)[1] + 6000.0));
827 BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][2],(float)(pl_correct.template get<0>(i)[2] + 7000.0));
828 }
829
830 // Check the sort to non sort buffer
831
832 auto & vsrt = cl2.getSortToNonSort();
833 vsrt.template deviceToHost<0>();
834
835 BOOST_REQUIRE_EQUAL(vsrt.size(),9);
836
837 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
838 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
839 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
840 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
841 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
842 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
843 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
844 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
845 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);
846
847 auto & vnsrt = cl2.getNonSortToSort();
848
849 BOOST_REQUIRE_EQUAL(vnsrt.size(),9);
850
851 // Move to CPU
852
853 vnsrt.template deviceToHost<0>();
854
855 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
856 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
857 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
858 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
859 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
860 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
861 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
862 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
863 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
864}
865
866
867BOOST_AUTO_TEST_CASE( CellList_gpu_use)
868{
869 std::cout << "Test cell list GPU" << "\n";
870
871 SpaceBox<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
872 SpaceBox<3,double> box2({-1.0f,-1.0f,-1.0f},{1.0f,1.0f,1.0f});
873
874 Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);
875
876 std::cout << "End cell list GPU" << "\n";
877
878 // Test the cell list
879}
880
881BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
882{
883 std::cout << "Test cell list GPU sparse" << "\n";
884
885 SpaceBox<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
886 SpaceBox<3,double> box2({-1.0f,-1.0f,-1.0f},{1.0f,1.0f,1.0f});
887
888 Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,unsigned int,int,true>> (box);
889
890 std::cout << "End cell list GPU sparse" << "\n";
891
892 // Test the cell list
893}
894
895template<unsigned int dim, typename vector_ps, typename vector_pr>
896void fill_random_parts(Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp, size_t n)
897{
898 for (size_t i = 0 ; i < n ; i++)
899 {
901
902 p.get(0) = ((box.getHigh(0) - box.getLow(0) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(0);
903 p.get(1) = ((box.getHigh(1) - box.getLow(1) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(1);
904 p.get(2) = ((box.getHigh(2) - box.getLow(2) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(2);
905
906 vd_pos.add(p);
907 vd_prp.add();
908 vd_prp.last().template get<0>() = i % 3;
909 }
910}
911
912
913template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
914__global__ void calc_force_number(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
915{
916 int p = threadIdx.x + blockIdx.x * blockDim.x;
917
918 if (p >= pos.size()) return;
919
920 Point<3,float> xp = pos.template get<0>(p);
921
922 auto it = cl.getNNIterator(cl.getCell(xp));
923
924 while (it.isNext())
925 {
926 auto q = it.get_sort();
927 auto q_ns = it.get();
928
929 int s1 = s_t_ns.template get<0>(q);
930
931 atomicAdd(&vn.template get<0>(s1), 1);
932
933 ++it;
934 }
935}
936
937template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
938__global__ void calc_force_number_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
939{
940 int p = threadIdx.x + blockIdx.x * blockDim.x;
941
942 if (p >= pos.size()) return;
943
944 Point<3,float> xp = pos.template get<0>(p);
945
946 auto it = cl.getNNIterator(cl.getCell(xp));
947
948 while (it.isNext())
949 {
950 auto q = it.get_sort();
951 auto q_ns = it.get();
952
953 int s1 = s_t_ns.template get<0>(q);
954
955 ++vn.template get<0>(p);
956
957 ++it;
958 }
959}
960
961template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
962__global__ void calc_force_number_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
963{
964 int p = threadIdx.x + blockIdx.x * blockDim.x + start;
965
966 if (p >= pos.size()) return;
967
968 Point<3,float> xp = pos.template get<0>(p);
969
970 auto it = cl.getNNIteratorBox(cl.getCell(xp));
971
972 while (it.isNext())
973 {
974 auto q = it.get_sort();
975
976 int s1 = s_t_ns.template get<0>(q);
977
978 atomicAdd(&vn.template get<0>(s1), 1);
979
980 ++it;
981 }
982}
983
984template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
985__global__ void calc_force_number_box_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
986{
987 int p = threadIdx.x + blockIdx.x * blockDim.x + start;
988
989 if (p >= pos.size()) return;
990
991 Point<3,float> xp = pos.template get<0>(p);
992
993 auto it = cl.getNNIteratorBox(cl.getCell(xp));
994
995 while (it.isNext())
996 {
997 auto q = it.get_sort();
998
999 ++vn.template get<0>(p);
1000
1001 ++it;
1002 }
1003}
1004
1005template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
1006__global__ void calc_force_number_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
1007{
1008 int p = threadIdx.x + blockIdx.x * blockDim.x;
1009
1010 if (p >= pos.size()) return;
1011
1012 Point<3,float> xp = pos.template get<0>(p);
1013
1014 auto it = cl.getNNIteratorRadius(cl.getCell(xp));
1015
1016 while (it.isNext())
1017 {
1018 auto q = it.get_sort();
1019
1020 int s1 = s_t_ns.template get<0>(q);
1021
1022 atomicAdd(&vn.template get<0>(s1), 1);
1023
1024 ++it;
1025 }
1026}
1027
1028template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
1029__global__ void calc_force_list_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
1030{
1031 int p = threadIdx.x + blockIdx.x * blockDim.x;
1032
1033 if (p >= pos.size()) return;
1034
1035 Point<3,float> xp = pos.template get<0>(p);
1036 int start_list = v_nscan.template get<0>(p);
1037
1038 auto it = cl.getNNIteratorBox(cl.getCell(xp));
1039
1040 while (it.isNext())
1041 {
1042 auto q = it.get_sort();
1043
1044 int s1 = s_t_ns.template get<0>(q);
1045
1046 v_list.template get<0>(start_list) = s1;
1047
1048 ++start_list;
1049 ++it;
1050 }
1051}
1052
1053template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
1054__global__ void calc_force_list(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
1055{
1056 int p = threadIdx.x + blockIdx.x * blockDim.x;
1057
1058 if (p >= pos.size()) return;
1059
1060 Point<3,float> xp = pos.template get<0>(p);
1061 int start_list = v_nscan.template get<0>(p);
1062
1063 auto it = cl.getNNIterator(cl.getCell(xp));
1064
1065 while (it.isNext())
1066 {
1067 auto q = it.get_sort();
1068
1069 int s1 = s_t_ns.template get<0>(q);
1070
1071 v_list.template get<0>(start_list) = s1;
1072
1073 ++start_list;
1074 ++it;
1075 }
1076}
1077
1078template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
1079__global__ void calc_force_list_box_partial(vector_pos pos,
1080 vector_ns s_t_ns,
1081 CellList_type cl,
1082 vector_n_type v_nscan,
1083 vector_n_type v_nscan_part,
1084 vector_n_type v_list)
1085{
1086 int p = threadIdx.x + blockIdx.x * blockDim.x;
1087
1088 if (p >= pos.size()) return;
1089
1090 Point<3,float> xp = pos.template get<0>(p);
1091 int start_list = v_nscan.template get<0>(p) + v_nscan_part.template get<0>(p);
1092
1093 auto it = cl.getNNIteratorBox(cl.getCell(xp));
1094
1095 while (it.isNext())
1096 {
1097 auto q = it.get_sort();
1098
1099 int s1 = s_t_ns.template get<0>(q);
1100
1101 v_list.template get<0>(start_list) = s1;
1102
1103 ++start_list;
1104 ++it;
1105 }
1106}
1107
1108template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
1109__global__ void calc_force_list_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
1110{
1111 int p = threadIdx.x + blockIdx.x * blockDim.x;
1112
1113 if (p >= pos.size()) return;
1114
1115 Point<3,float> xp = pos.template get<0>(p);
1116 int start_list = v_nscan.template get<0>(p);
1117
1118 auto it = cl.getNNIteratorRadius(cl.getCell(xp));
1119
1120 while (it.isNext())
1121 {
1122 auto q = it.get_sort();
1123
1124 int s1 = s_t_ns.template get<0>(q);
1125
1126 v_list.template get<0>(start_list) = s1;
1127
1128 ++start_list;
1129 ++it;
1130 }
1131}
1132
1133template<unsigned int impl>
1135{
1136 template<typename CellS, typename Cells_cpu_type, typename T>
1137 static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
1138 {
1139 }
1140
1141 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
1142 static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
1143 {
1144 auto ite = pl.getGPUIterator();
1145
1146 CUDA_LAUNCH((calc_force_number),ite,pl.toKernel(),s_t_ns.toKernel(),
1147 cl2.toKernel(),n_out.toKernel());
1148 }
1149
1150 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1151 static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2,n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1152 {
1153 auto ite = pl.getGPUIterator();
1154
1155 CUDA_LAUNCH((calc_force_list),ite,pl.toKernel(),
1156 s_t_ns.toKernel(),
1157 cl2.toKernel(),
1158 n_out_scan.toKernel(),
1159 nn_list.toKernel());
1160 }
1161
1162 template<typename NN_type>
1163 static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIterator(cell))
1164 {
1165 return nn.getNNIterator(cell);
1166 }
1167};
1168
1169template<>
1171{
1172 template<typename CellS, typename Cells_cpu_type, typename T>
1173 static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
1174 {
1175 cl2.setRadius(radius);
1176 cl_cpu.setRadius(radius);
1177 }
1178
1179 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
1180 static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
1181 {
1182 auto ite = pl.getGPUIterator();
1183
1184 CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
1185 decltype(s_t_ns.toKernel()),
1186 decltype(cl2.toKernel()),
1187 decltype(n_out.toKernel())>),
1188 ite,pl.toKernel(),
1189 s_t_ns.toKernel(),
1190 cl2.toKernel(),
1191 n_out.toKernel());
1192 }
1193
1194 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1195 static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1196 {
1197 auto ite = pl.getGPUIterator();
1198
1199 CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
1200 decltype(s_t_ns.toKernel()),
1201 decltype(cl2.toKernel()),
1202 decltype(nn_list.toKernel())>),
1203 ite,pl.toKernel(),
1204 s_t_ns.toKernel(),
1205 cl2.toKernel(),
1206 n_out_scan.toKernel(),
1207 nn_list.toKernel());
1208 }
1209
1210 template<typename NN_type>
1211 static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1212 {
1213 return nn.getNNIteratorRadius(cell);
1214 }
1215};
1216
1217template<>
1219{
1220 template<typename CellS, typename Cells_cpu_type, typename T>
1221 static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
1222 {
1223 cl2.setRadius(radius);
1224 cl_cpu.setRadius(radius);
1225 }
1226
1227 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
1228 static void calc_num_noato(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
1229 {
1230 auto ite = s_t_ns.getGPUIterator();
1231
1232 CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
1233 decltype(s_t_ns.toKernel()),
1234 decltype(cl2.toKernel()),
1235 decltype(n_out.toKernel())>),
1236 ite,pl.toKernel(),
1237 s_t_ns.toKernel(),
1238 cl2.toKernel(),
1239 n_out.toKernel(),
1240 start);
1241 }
1242
1243 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
1244 static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
1245 {
1246 auto ite = s_t_ns.getGPUIterator();
1247
1248 CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
1249 decltype(s_t_ns.toKernel()),
1250 decltype(cl2.toKernel()),
1251 decltype(n_out.toKernel())>),
1252 ite,
1253 pl.toKernel(),
1254 s_t_ns.toKernel(),
1255 cl2.toKernel(),
1256 n_out.toKernel(),
1257 start);
1258 }
1259
1260 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1261 static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1262 {
1263 auto ite = s_t_ns.getGPUIterator();
1264
1265 CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
1266 decltype(s_t_ns.toKernel()),
1267 decltype(cl2.toKernel()),
1268 decltype(nn_list.toKernel())>),
1269 ite,pl.toKernel(),
1270 s_t_ns.toKernel(),
1271 cl2.toKernel(),
1272 n_out_scan.toKernel(),
1273 nn_list.toKernel());
1274 }
1275
1276 template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1277 static void calc_list_partial(pl_type & pl,
1278 s_t_ns_type & s_t_ns,
1279 cl2_type & cl2,
1280 n_out_scan_type & n_out_scan,
1281 n_out_scan_type & n_out_scan_partial,
1282 nn_list_type & nn_list)
1283 {
1284 auto ite = s_t_ns.getGPUIterator();
1285
1286 CUDA_LAUNCH((calc_force_list_box_partial),ite,pl.toKernel(),
1287 s_t_ns.toKernel(),
1288 cl2.toKernel(),
1289 n_out_scan.toKernel(),
1290 n_out_scan_partial.toKernel(),
1291 nn_list.toKernel());
1292 }
1293
1294 template<typename NN_type>
1295 static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1296 {
1297 return nn.getNNIteratorRadius(cell);
1298 }
1299};
1300
1301template<unsigned int dim, typename T, typename CellS, int impl>
1302void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 1)
1303{
1304 // Origin
1305 Point<dim,T> org({0.0,0.0,0.0});
1306
1307 // id Cell list
1308 CellS cl2(box,div,2);
1309
1310 CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);
1311
1312 cl2.setBoxNN(box_nn);
1313
1314 T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1315 execute_cl_test<impl>::set_radius(cl2,cl_cpu,radius);
1316
1317 // vector of particles
1318
1321
1324
1326
1327 // create random particles
1328
1329 fill_random_parts<3>(box,pl,pl_prp,npart);
1330
1331 pl_prp_out.resize(pl.size());
1332 pl_out.resize(pl.size());
1333 n_out.resize(pl.size()+1);
1334 n_out.fill<0>(0);
1335
1336 pl_prp.resize(pl.size());
1337 pl_prp_out.resize(pl.size());
1338
1339 pl.template hostToDevice<0>();
1340 pl_prp.template hostToDevice<0,1>();
1341
1342 // Construct an equivalent CPU cell-list
1343
1344 // construct
1345
1346 auto it2 = pl.getIterator();
1347
1348 while (it2.isNext())
1349 {
1350 auto p = it2.get();
1351
1352 Point<3,T> xp = pl.get(p);
1353
1354 cl_cpu.add(xp,p);
1355
1356 ++it2;
1357 }
1358
1359 size_t g_m = pl.size() / 2;
1360
1361 gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
1362 cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
1363
1364 auto & s_t_ns = cl2.getSortToNonSort();
1365
1366 pl.template hostToDevice<0>();
1367
1368 execute_cl_test<impl>::calc_num(pl,s_t_ns,cl2,n_out,0);
1369
1370 // Domain particles
1371
1372
1373 auto & gdsi = cl2.getDomainSortIds();
1374 gdsi.template deviceToHost<0>();
1375 s_t_ns.template deviceToHost<0>();
1376
1377 bool match = true;
1378 for (size_t i = 0 ; i < g_m ; i++)
1379 {
1380 unsigned int p = gdsi.template get<0>(i);
1381
1382 match &= (s_t_ns.template get<0>(p) < g_m);
1383 }
1384
1385 BOOST_REQUIRE_EQUAL(match,true);
1386
1387 // Check
1388
1389 n_out.deviceToHost<0>();
1390
1391 {
1392 bool check = true;
1393 auto it = pl.getIterator();
1394
1395 while(it.isNext())
1396 {
1397 auto p = it.get();
1398
1399 Point<dim,T> xp = pl.get(p);
1400
1401 // Get NN iterator
1402
1403 auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIterator(cl_cpu.getCell(xp))*/;
1404
1405 size_t n_ele = 0;
1406 while (NN_it.isNext())
1407 {
1408 auto q = NN_it.get();
1409
1410 n_ele++;
1411
1412 ++NN_it;
1413 }
1414
1415 check &= n_ele == n_out.template get<0>(p);
1416
1417 if (check == false)
1418 {
1419 std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
1420 break;
1421 }
1422
1423 ++it;
1424 }
1425 BOOST_REQUIRE_EQUAL(check,true);
1426 }
1427
1428 // now we scan the buffer
1429
1432
1433 n_out_scan.resize(pl.size()+1);
1434
1435 openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
1436 n_out_scan.template deviceToHost<0>();
1437
1438 if (n_out_scan.template get<0>(pl.size()) == 0)
1439 {return;}
1440
1441 nn_list.resize(n_out_scan.template get<0>(pl.size()));
1442
1443 // Now for each particle we construct the list
1444
1445 pl.template hostToDevice<0>();
1446
1447 execute_cl_test<impl>::calc_list(pl,s_t_ns,cl2,n_out_scan,nn_list);
1448
1449 nn_list.template deviceToHost<0>();
1450
1451 // Check
1452
1453 n_out.deviceToHost<0>();
1454
1455 {
1456 bool check = true;
1457 auto it = pl.getIterator();
1458
1459 while(it.isNext())
1460 {
1461 auto p = it.get();
1462
1463 Point<dim,T> xp = pl.get(p);
1464
1465 // Get NN iterator
1466
1467 openfpm::vector<int> cpu_list;
1468
1469 auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIterator(cl_cpu.getCell(xp));*/
1470
1471 while (NN_it.isNext())
1472 {
1473 auto q = NN_it.get();
1474
1475 cpu_list.add(q);
1476
1477 ++NN_it;
1478 }
1479
1480 openfpm::vector<int> gpu_list;
1481
1482 for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1483 {
1484 gpu_list.add(nn_list.template get<0>(i));
1485 }
1486
1487 // sort bost vector
1488
1489 cpu_list.sort();
1490 gpu_list.sort();
1491
1492 for (size_t j = 0 ; j < cpu_list.size() ; j++)
1493 {check &= cpu_list.get(j) == gpu_list.get(j);}
1494
1495 ++it;
1496 }
1497
1498 BOOST_REQUIRE_EQUAL(check,true);
1499
1500 }
1501}
1502
1503template<unsigned int dim, typename T, typename CellS, int impl>
1504void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 1)
1505{
1506 // Origin
1507 Point<dim,T> org({0.0,0.0,0.0});
1508
1509 // id Cell list
1510 CellS cl2_split1(box,div,2);
1511 CellS cl2_split2(box,div,2);
1512
1513 CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);
1514
1515 cl2_split1.setBoxNN(box_nn);
1516 cl2_split2.setBoxNN(box_nn);
1517
1518 T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1519 execute_cl_test<impl>::set_radius(cl2_split1,cl_cpu,radius);
1520 execute_cl_test<impl>::set_radius(cl2_split2,cl_cpu,radius);
1521
1522 // vector of particles
1523
1526
1529
1532
1533 // create random particles
1534
1535 fill_random_parts<3>(box,pl,pl_prp,npart);
1536
1537 pl_prp_out.resize(pl.size());
1538 pl_out.resize(pl.size());
1539 n_out.resize(pl.size()+1);
1540 n_out.fill<0>(0);
1541
1542 pl_prp.resize(pl.size());
1543 pl_prp_out.resize(pl.size());
1544
1545 pl.template hostToDevice<0>();
1546 pl_prp.template hostToDevice<0,1>();
1547
1548 // Construct an equivalent CPU cell-list
1549
1550 // construct
1551
1552 auto it2 = pl.getIterator();
1553
1554 while (it2.isNext())
1555 {
1556 auto p = it2.get();
1557
1558 Point<3,T> xp = pl.get(p);
1559
1560 cl_cpu.add(xp,p);
1561
1562 ++it2;
1563 }
1564
1565 size_t g_m = pl.size() / 2;
1566
1567 gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
1568 cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
1569 cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
1570 auto & s_t_ns_s1 = cl2_split1.getSortToNonSort();
1571 auto & s_t_ns_s2 = cl2_split2.getSortToNonSort();
1572
1573 execute_cl_test<impl>::calc_num_noato(pl,s_t_ns_s1,cl2_split1,n_out,0);
1574 n_out_partial = n_out;
1575 execute_cl_test<impl>::calc_num_noato(pl,s_t_ns_s2,cl2_split2,n_out,0);
1576
1577 // Domain particles
1578
1579 auto & gdsi_s1 = cl2_split1.getDomainSortIds();
1580 gdsi_s1.template deviceToHost<0>();
1581 s_t_ns_s1.template deviceToHost<0>();
1582
1583 bool match = true;
1584 for (size_t i = 0 ; i < g_m ; i++)
1585 {
1586 unsigned int p = gdsi_s1.template get<0>(i);
1587
1588 match &= (s_t_ns_s1.template get<0>(p) < g_m);
1589 }
1590
1591 BOOST_REQUIRE_EQUAL(match,true);
1592
1593 // Check
1594
1595 n_out.deviceToHost<0>();
1596
1597 {
1598 bool check = true;
1599 auto it = pl.getIteratorTo(pl.size()/2-1);
1600
1601 while(it.isNext())
1602 {
1603 auto p = it.get();
1604
1605 Point<dim,T> xp = pl.get(p);
1606
1607 // Get NN iterator
1608
1609 auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIterator(cl_cpu.getCell(xp))*/;
1610
1611 size_t n_ele = 0;
1612 while (NN_it.isNext())
1613 {
1614 auto q = NN_it.get();
1615
1616 n_ele++;
1617
1618 ++NN_it;
1619 }
1620
1621 check &= n_ele == n_out.template get<0>(p);
1622
1623 if (check == false)
1624 {
1625 std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
1626 break;
1627 }
1628
1629 ++it;
1630 }
1631 BOOST_REQUIRE_EQUAL(check,true);
1632 }
1633
1634 // now we scan the buffer
1635
1638
1639 n_out_scan.resize(n_out.size());
1640
1641 openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
1642
1643 n_out_scan.template deviceToHost<0>();
1644
1645 if (n_out_scan.template get<0>(pl.size()) == 0)
1646 {return;}
1647
1648 nn_list.resize(n_out_scan.template get<0>(pl.size()));
1649
1650 // Now for each particle we construct the list
1651
1652 pl.template hostToDevice<0>();
1653
1654 execute_cl_test<impl>::calc_list(pl,s_t_ns_s1,cl2_split1,n_out_scan,nn_list);
1655 execute_cl_test<impl>::calc_list_partial(pl,s_t_ns_s2,cl2_split2,n_out_scan,n_out_partial,nn_list);
1656
1657 nn_list.template deviceToHost<0>();
1658
1659 // Check
1660
1661 n_out.deviceToHost<0>();
1662
1663 {
1664 bool check = true;
1665 auto it = pl.getIteratorTo(pl.size()/2-1);
1666
1667 while(it.isNext())
1668 {
1669 auto p = it.get();
1670
1671 Point<dim,T> xp = pl.get(p);
1672
1673 // Get NN iterator
1674
1675 openfpm::vector<int> cpu_list;
1676
1677 auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIterator(cl_cpu.getCell(xp));*/
1678
1679 while (NN_it.isNext())
1680 {
1681 auto q = NN_it.get();
1682
1683 cpu_list.add(q);
1684
1685 ++NN_it;
1686 }
1687
1688 openfpm::vector<int> gpu_list;
1689
1690 for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1691 {
1692 gpu_list.add(nn_list.template get<0>(i));
1693 }
1694
1695 // sort both vector
1696
1697#ifndef MAKE_CELLLIST_DETERMINISTIC
1698
1699 cpu_list.sort();
1700 gpu_list.sort();
1701
1702#endif
1703
1704 for (size_t j = 0 ; j < cpu_list.size() ; j++)
1705 {check &= cpu_list.get(j) == gpu_list.get(j);}
1706
1707 if (check == false)
1708 {
1709 std::cout << "NPARTS: " << npart << std::endl;
1710
1711 for (size_t j = 0 ; j < cpu_list.size() ; j++)
1712 {std::cout << cpu_list.get(j) << " " << gpu_list.get(j) << std::endl;}
1713
1714 break;
1715 }
1716
1717 ++it;
1718 }
1719
1720 BOOST_REQUIRE_EQUAL(check,true);
1721
1722 }
1723}
1724
1725BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box)
1726{
1727 std::cout << "Test cell list GPU" << "\n";
1728
1729 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1730 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1731
1732 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
1733 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});
1734
1735 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
1736 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});
1737
1738 std::cout << "End cell list GPU" << "\n";
1739
1740 // Test the cell list
1741}
1742
1743BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split)
1744{
1745 std::cout << "Test cell list GPU split" << "\n";
1746
1747 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1748 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1749
1750 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
1751 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});
1752
1753 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
1754 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});
1755
1756 std::cout << "End cell list GPU split" << "\n";
1757
1758 // Test the cell list
1759}
1760
1761/*BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split_test_performance)
1762{
1763 typedef float T;
1764 constexpr int dim = 3;
1765 typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> CellS;
1766
1767 std::cout << "Performance" << "\n";
1768
1769 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1770 size_t div[] = {64,64,64};
1771
1772 // Origin
1773 Point<dim,T> org({0.0,0.0,0.0});
1774
1775 // id Cell list
1776 CellS cl2_split1(box,div,2);
1777 CellS cl2_split2(box,div,2);
1778
1779 cl2_split1.setBoxNN(2);
1780 cl2_split2.setBoxNN(2);
1781
1782 T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1783
1784 // vector of particles
1785
1786 openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl;
1787 openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl_out;
1788
1789 openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp;
1790 openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp_out;
1791
1792 openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out;
1793 openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_partial;
1794
1795 // create random particles
1796
1797 fill_random_parts<3>(box,pl,pl_prp,1400000);
1798
1799 pl_prp_out.resize(pl.size());
1800 pl_out.resize(pl.size());
1801 n_out.resize(pl.size()+1);
1802 n_out.fill<0>(0);
1803
1804 pl_prp.resize(pl.size());
1805 pl_prp_out.resize(pl.size());
1806
1807 pl.hostToDevice<0>();
1808 pl_prp.hostToDevice<0,1>();
1809
1810 size_t g_m = pl.size() / 2;
1811
1812 gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
1813
1814 cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
1815 cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
1816
1817 cudaDeviceSynchronize();
1818
1819 timer t;
1820 t.start();
1821
1822 cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
1823 cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
1824
1825 t.stop();
1826 std::cout << "Time: " << t.getwct() << std::endl;
1827
1828 cudaDeviceSynchronize();
1829
1830 cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size());
1831
1832 cudaDeviceSynchronize();
1833
1834 timer t2;
1835 t2.start();
1836
1837 cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size());
1838
1839 t2.stop();
1840 std::cout << "Time: " << t2.getwct() << std::endl;
1841
1842 std::cout << "Performance" << "\n";
1843
1844 // Test the cell list
1845}*/
1846
1847#if 0
1848
1849BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse)
1850{
1851 std::cout << "Test cell list GPU" << "\n";
1852
1853 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1854 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1855
1856 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,1000,{32,32,32},2);
1857 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,10000,{32,32,32},2);
1858
1859 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,1000,{32,32,32},2);
1860 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,10000,{32,32,32},2);
1861
1862 std::cout << "End cell list GPU" << "\n";
1863
1864 // Test the cell list
1865}
1866
1867#endif
1868
1869BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius)
1870{
1871 std::cout << "Test cell list GPU" << "\n";
1872
1873 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1874 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1875
1876 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,1000,{32,32,32});
1877 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,10000,{32,32,32});
1878
1879 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,1000,{32,32,32});
1880 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,10000,{32,32,32});
1881
1882 std::cout << "End cell list GPU" << "\n";
1883
1884 // Test the cell list
1885}
1886
1887#if 0
1888
1889BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force)
1890{
1891 std::cout << "Test cell list GPU" << "\n";
1892
1893 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1894 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1895
1896 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,1000,{16,16,16});
1897 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,10000,{16,16,16});
1898
1899 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,1000,{16,16,16});
1900 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,10000,{16,16,16});
1901
1902 std::cout << "End cell list GPU" << "\n";
1903
1904 // Test the cell list
1905}
1906
1907BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse)
1908{
1909 std::cout << "Test cell list GPU force sparse" << "\n";
1910
1911 SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1912 SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1913
1914 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,1000,{16,16,16});
1915 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,10000,{16,16,16});
1916
1917 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,1000,{16,16,16});
1918 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,10000,{16,16,16});
1919
1920 std::cout << "End cell list GPU force sparse" << "\n";
1921
1922 // Test the cell list
1923}
1924
1925#endif
1926
1927template<typename CellList_type, typename Vector_type, typename Vector_out>
1928__global__ void cl_offload_gpu(CellList_type cl, Vector_type parts, Vector_out output)
1929{
1930 int p = threadIdx.x + blockIdx.x * blockDim.x;
1931
1932 if (p >= parts.size()) return;
1933
1934 Point<3,float> xp = parts.template get<0>(p);
1935
1936 output.template get<0>(p) = cl.getNelements(cl.getCell(xp));
1937}
1938
1939template<typename CellList_type, typename Vector_type, typename Vector_scan_type, typename Vector_list_type>
1940__global__ void cl_offload_gpu_list(CellList_type cl, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
1941{
1942 int p = threadIdx.x + blockIdx.x * blockDim.x;
1943
1944 if (p >= parts.size()) return;
1945
1946 Point<3,float> xp = parts.template get<0>(p);
1947
1948 int id = cl.getCell(xp);
1949 int n_ele = cl.getNelements(id);
1950 int start = scan.template get<0>(p);
1951
1952 for (int j = 0 ; j < n_ele ; j++)
1953 {
1954 list.template get<0>(start+j) = cl.get(id,j);
1955 }
1956
1957}
1958
1959#if 0
1960
1961BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
1962{
1963 std::cout << "Test cell list offload gpu" << "\n";
1964
1965 // Subdivisions
1966 size_t div[3] = {10,10,10};
1967
1968 // grid info
1969 grid_sm<3,void> g_info(div);
1970
1971 Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
1972
1973 // CellS = CellListM<dim,T,8>
1975
1978 v.resize(10000);
1979 os.resize(v.size());
1980
1981 for (size_t i = 0 ; i < v.size() ; i++)
1982 {
1983 v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1984 v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1985 v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1986
1987 Point<3,float> xp = v.template get<0>(i);
1988
1989 cl1.add(xp,i);
1990 }
1991
1992 auto ite = v.getGPUIterator();
1993
1994 cl1.hostToDevice();
1995 v.hostToDevice<0>();
1996
1997 CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());
1998
1999 os.deviceToHost<0>();
2000
2001 bool match = true;
2002 for (size_t i = 0 ; i < os.size() ; i++)
2003 {
2004 Point<3,float> xp = v.template get<0>(i);
2005
2006 match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
2007 }
2008
2009 BOOST_REQUIRE_EQUAL(match,true);
2010
2011 // now we scan the vector out
2012
2014 os_scan.resize(v.size());
2015
2017 openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx);
2018
2019 os_scan.deviceToHost<0>();
2020 os.deviceToHost<0>(os.size()-1,os.size()-1);
2021 size_t size_list = os_scan.template get<0>(os_scan.size()-1) + os.template get<0>(os.size()-1);
2022
2024 os_list.resize(size_list);
2025
2026 CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
2027 decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
2028 cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
2029
2030 os_list.deviceToHost<0>();
2031
2032 match = true;
2033 for (size_t i = 0 ; i < os.size() ; i++)
2034 {
2035 Point<3,float> xp = v.template get<0>(i);
2036
2037 for (size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
2038 {
2039 match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
2040 }
2041 }
2042
2043 BOOST_REQUIRE_EQUAL(match,true);
2044
2045 std::cout << "End cell list offload gpu" << "\n";
2046
2047 // Test the cell list
2048}
2049
2050#endif
2051
2052BOOST_AUTO_TEST_CASE( CellList_swap_test )
2053{
2054 size_t npart = 4096;
2055
2056 Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
2057
2058 // Subdivisions
2059 size_t div[3] = {10,10,10};
2060
2061 // Origin
2062 Point<3,float> org({0.0,0.0,0.0});
2063
2064 // id Cell list
2065 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl2(box,div,2);
2066 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl3(box,div,2);
2067 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl4(box,div,2);
2068
2069 // vector of particles
2070
2073
2076
2077 // create random particles
2078
2079 fill_random_parts<3>(box,pl,pl_prp,npart);
2080
2081 pl_prp_out.resize(pl.size());
2082 pl_out.resize(pl.size());
2083
2084 pl_prp.resize(pl.size());
2085 pl_prp_out.resize(pl.size());
2086
2087 pl.template hostToDevice<0>();
2088 pl_prp.template hostToDevice<0,1>();
2089
2090 size_t g_m = pl.size() / 2;
2091
2092 gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
2093 cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
2094 cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
2095
2096 cl3.swap(cl2);
2097
2098 // move device to host
2099
2100 cl3.debug_deviceToHost();
2101 cl4.debug_deviceToHost();
2102
2103 BOOST_REQUIRE_EQUAL(cl3.getNCells(),cl4.getNCells());
2104
2107
2108 bool check = true;
2109 for (size_t i = 0 ; i < cl3.getNCells() ; i++)
2110 {
2111 check &= cl3.getNelements(i) == cl4.getNelements(i);
2112
2113 for (size_t j = 0 ; j < cl3.getNelements(i) ; j++)
2114 {
2115 s1.add(cl3.get(i,j));
2116 s2.add(cl4.get(i,j));
2117 }
2118
2119 s1.sort();
2120 s2.sort();
2121
2122 for (size_t j = 0 ; j < s1.size() ; j++)
2123 {
2124 check &= s1.get(j) == s2.get(j);
2125 }
2126 }
2127
2128 BOOST_REQUIRE_EQUAL(check,true);
2129
2131}
2132
2133BOOST_AUTO_TEST_SUITE_END()
2134
This class represent an N-dimensional box.
Definition Box.hpp:61
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
Definition Box.hpp:556
__device__ __host__ T getHigh(int i) const
get the high interval of the box
Definition Box.hpp:567
Class for FAST cell list implementation.
Definition CellList.hpp:357
This class implement an NxN (dense) matrix.
Definition Matrix.hpp:33
It is a class that work like a vector of vector.
Definition MemFast.hpp:89
This class implement the point shape in an N-dimensional space.
Definition Point.hpp:28
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Definition Point.hpp:172
This class represent an N-dimensional box.
Definition SpaceBox.hpp:27
Declaration grid_key_dx_iterator_sub.
grid_key_dx is the key to access any element in the grid
Definition grid_key.hpp:19
Declaration grid_sm.
Definition grid_sm.hpp:167
__device__ __host__ grid_key_dx< N > InvLinId(mem_id id) const
Construct.
Definition grid_sm.hpp:609
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const signed char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
Definition grid_sm.hpp:454
__device__ __host__ size_t size() const
Return the size of the grid.
Definition grid_sm.hpp:657
No transformation.
vector_sparse_gpu_ker< T, Ti, layout_base > toKernel()
toKernel function transform this structure into one that can be used on GPU
void flush(gpu::ofp_context_t &context, flush_type opt=FLUSH_ON_HOST)
merge the added element to the main data array
void setGPUInsertBuffer(int nblock, int nslot)
set the gpu insert buffer for every block
Implementation of 1-D std::vector like structure.
size_t size()
Stub size.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Transform the boost::fusion::vector into memory specification (memory_traits)