8 #include "util/cuda_util.hpp"
10 #define BOOST_TEST_DYN_LINK
11 #include <boost/test/unit_test.hpp>
13 #include "NN/CellList/cuda/CellList_gpu.hpp"
14 #include "NN/CellList/CellList.hpp"
15 #include "util/boost/boost_array_openfpm.hpp"
16 #include "Point_test.hpp"
17 #include "util/cuda_util.hpp"
19 BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
// NOTE(review): extraction-damaged fragment. The integers leading many lines
// are the ORIGINAL file's line numbers fused into the text; gaps in them
// (21 -> 59 -> 66 ...) mark missing source lines (the function signature,
// the declarations of cl_n/cellIndex_LocalIndex/vPos/gr/div, the kernel
// launch, and braces). From the call site below this appears to be
// test_sub_index<dim,T,cnt_type,ids_type>() — restore from version control.
21 template<
unsigned int dim,
typename T,
typename cnt_type,
typename ids_type>
59 for (
size_t i = 0 ; i < dim ; i++)
// cl_n: per-cell particle counter over a 17x17x17 cell grid, zeroed first.
66 cl_n.resize(17*17*17);
67 cl_n.template fill<0>(0);
70 cellIndex_LocalIndex.resize(vPos.
size());
72 size_t sz[3] = {17,17,17};
75 auto ite = vPos.getGPUIterator();
// Positions copied host -> device; the kernel launch itself is missing
// (numbering jumps 77 -> 93, leaving only a trailing launch argument).
77 vPos.template hostToDevice<0>();
93 cellIndex_LocalIndex.toKernel());
// Results back to host for the assertions below.
95 cl_n.template deviceToHost<0>();
96 cellIndex_LocalIndex.template deviceToHost<0>();
// Each test particle must map to the linearised id of its expected cell.
100 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(0)[0],gr.LinId({2,2,2}));
101 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(1)[0],gr.LinId({9,2,2}));
102 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(2)[0],gr.LinId({2,9,2}));
103 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(3)[0],gr.LinId({2,2,9}));
104 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(4)[0],gr.LinId({9,9,2}));
105 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(5)[0],gr.LinId({9,2,9}));
106 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(6)[0],gr.LinId({2,9,9}));
107 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(7)[0],gr.LinId({9,9,9}));
108 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(8)[0],gr.LinId({0,0,0}));
109 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(9)[0],gr.LinId({2,2,2}));
// Cell {2,2,2} receives particles 0 and 9, hence count 2; all others 1.
111 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
112 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
113 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
114 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
115 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
116 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
117 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
118 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
119 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
// NOTE(review): extraction-damaged fragment (same artifact as test_sub_index:
// leading integers are original line numbers; numbering gaps = missing lines).
// This variant exercises fill_cellIndex_LocalIndex with a shift_only<dim,T>
// transform; local declarations and parts of the launch are missing.
123 template<
unsigned int dim,
typename T,
typename cnt_type,
typename ids_type>
124 void test_sub_index2()
179 for (
size_t i = 0 ; i < dim ; i++)
// Zeroed per-cell counters over a 17^3 grid, as in test_sub_index.
186 cl_n.resize(17*17*17);
187 cl_n.template fill<0>(0);
190 cellIndex_LocalIndex.resize(vPos.
size());
192 size_t sz[3] = {17,17,17};
195 auto ite = vPos.getGPUIterator();
197 vPos.template hostToDevice<0>();
// Kernel launch: middle arguments lost (numbering jumps 204 -> 212).
204 CUDA_LAUNCH_DIM3((fill_cellIndex_LocalIndex<dim,T,ids_type,
shift_only<dim,T>>),ite.wthr,ite.thr,div,
212 cellIndex_LocalIndex.toKernel());
214 cl_n.template deviceToHost<0>();
215 cellIndex_LocalIndex.template deviceToHost<0>();
// Expected cell assignments mirror test_sub_index (shift should not change
// which cell each test particle lands in for this setup).
219 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(0)[0],gr.LinId({2,2,2}));
220 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(1)[0],gr.LinId({9,2,2}));
221 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(2)[0],gr.LinId({2,9,2}));
222 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(3)[0],gr.LinId({2,2,9}));
223 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(4)[0],gr.LinId({9,9,2}));
224 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(5)[0],gr.LinId({9,2,9}));
225 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(6)[0],gr.LinId({2,9,9}));
226 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(7)[0],gr.LinId({9,9,9}));
227 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(8)[0],gr.LinId({0,0,0}));
228 BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(9)[0],gr.LinId({2,2,2}));
230 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
231 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
232 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
233 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
234 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
235 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
236 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
237 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
238 BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
// NOTE(review): extraction-damaged fragment. Creates n_part particles with
// uniformly random positions in [0,1)^3 (host-side rand()), and inserts each
// into the CPU cell list. The parameter list is truncated (vPos/cellList
// parameters missing), as are the loop body braces and it.next()/resize calls
// (numbering gaps 242 -> 248 -> 254, 256 -> 259, 264 -> end).
241 template<
unsigned int dim,
typename T>
242 void create_n_part(
int n_part,
248 auto it = vPos.getIterator();
// Random coordinates per axis; double-precision rand()/RAND_MAX in [0,1].
254 vPos.template get<0>(p)[0] = (double)rand()/RAND_MAX;
255 vPos.template get<0>(p)[1] = (double)rand()/RAND_MAX;
256 vPos.template get<0>(p)[2] = (double)rand()/RAND_MAX;
// Build a Point xp from the stored coordinates (xp declaration is missing).
259 xp.
get(0) = vPos.template get<0>(p)[0];
260 xp.
get(1) = vPos.template get<0>(p)[1];
261 xp.
get(2) = vPos.template get<0>(p)[2];
// Register the particle index in its cell on the reference CPU cell list.
263 size_t c = cellList.getCell(xp);
264 cellList.addCell(c,p);
// NOTE(review): extraction-damaged fragment (signature and name missing; from
// the call site this is create_starts_and_parts_ids(cellList,gr,n_part,tot,
// starts,cellIndex_LocalIndex,cells)). Builds, on the host, the expected
// cell-start offsets and per-particle (cell id, local index) pairs from a
// reference CPU cell list — the oracle the GPU results are checked against.
270 template<
unsigned int dim,
typename T,
typename cnt_type,
typename ids_type>
281 cellIndex_LocalIndex.resize(n_part);
282 starts.resize(n_cell);
283 cells.resize(n_part);
// Iterate all cells of the grid (iterator itg and `start` declared in the
// missing lines 284-290).
291 auto cell = itg.get();
293 size_t clin = gr.
LinId(cell);
295 for (
size_t j = 0 ; j < cellList.getNelements(clin) ; j++)
297 size_t p_id = cellList.get(clin,j);
// For particle p_id: [0] = its cell's linear id, [1] = its slot in the cell.
299 cellIndex_LocalIndex.template get<0>(p_id)[0] = clin;
301 cellIndex_LocalIndex.template get<0>(p_id)[1] = j;
303 cells.template get<0>(start+j) = p_id;
// Running prefix: start of each cell's particle range in `cells`.
305 starts.template get<0>(clin) = start;
306 start += cellList.getNelements(clin);
// NOTE(review): __global__ kernel that inserts ten fixed cells (key1..key10,
// declared in the missing original lines 314-329, together with the sparse
// insert-block initialisation) into a sparse vector, storing each cell's own
// linearised id as its value. Intended for a single-block launch (it is
// launched <<<1,1>>> in test_cell_count_n below).
312 template<
typename sparse_vector_type>
313 __global__
void construct_cells(sparse_vector_type sv,
grid_sm<3,void> gs)
330 sv.template insert<0>(gs.
LinId(key1)) = gs.
LinId(key1);
331 sv.template insert<0>(gs.
LinId(key2)) = gs.
LinId(key2);
332 sv.template insert<0>(gs.
LinId(key3)) = gs.
LinId(key3);
333 sv.template insert<0>(gs.
LinId(key4)) = gs.
LinId(key4);
334 sv.template insert<0>(gs.
LinId(key5)) = gs.
LinId(key5);
335 sv.template insert<0>(gs.
LinId(key6)) = gs.
LinId(key6);
336 sv.template insert<0>(gs.
LinId(key7)) = gs.
LinId(key7);
337 sv.template insert<0>(gs.
LinId(key8)) = gs.
LinId(key8);
338 sv.template insert<0>(gs.
LinId(key9)) = gs.
LinId(key9);
339 sv.template insert<0>(gs.
LinId(key10)) = gs.
LinId(key10);
// Flush the block-cooperative insert buffer so the inserts become visible.
341 sv.flush_block_insert();
// NOTE(review): extraction-damaged test. Populates a sparse cell vector via
// construct_cells, counts non-empty neighbour cells per occupied cell
// (countNonEmptyNeighborCells), scans the counts, then materialises the
// neighbour-cell list (fillNeighborCellList) and checks it against hard-coded
// linear ids. Declarations of vs, gs, middle, cells_nn, cells_nn_test,
// cell_nn_list and several loops are in the missing lines.
344 void test_cell_count_n()
// Background -1 marks "empty cell" in the sparse vector.
350 vs.template setBackground<0>(-1);
354 size_t sz[] = {17,17,17};
357 CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.
toKernel(),gs);
362 int mid = gs.
LinId(middle);
// Build the relative neighbour-offset table (offsets stored relative to the
// linear id of the `middle` cell).
379 cells_nn_test.get<0>(cells_nn_test.
size()-1) = (
int)gs.
LinId(p) - mid;
384 cells_nn_test.template hostToDevice<0>();
386 auto itgg = vs.getGPUIterator();
387 CUDA_LAUNCH((countNonEmptyNeighborCells),itgg,vs.
toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
389 cells_nn.deviceToHost<0>();
// Expected neighbour counts for the 10 inserted cells.
391 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
392 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
393 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
394 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
395 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
396 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
397 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
398 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
399 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
400 BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
// Exclusive scan in place: counts -> start offsets for the flat list.
403 openfpm::scan((
unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.
size(), (
unsigned int *)cells_nn.template getDeviceBuffer<0>() , gpuContext);
// Total list length = 7 cells x 8 neighbours + 9 + 2 + 1.
406 cell_nn_list.resize(7*8 + 9 + 2 + 1);
408 CUDA_LAUNCH((fillNeighborCellList),itgg,vs.
toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
410 cell_nn_list.deviceToHost<0>();
// First 7 cells share the same 8 neighbour ids (hard-coded linear ids).
413 for (
size_t i = 0 ; i < 7 ; i++)
415 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
416 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
417 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
418 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
419 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
420 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
421 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
422 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
// 8th cell has the same 8 neighbours plus 2149 (count 9 above).
426 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
427 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
428 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
429 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
430 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
431 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
432 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
433 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
434 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);
// 9th cell: 2 neighbours; 10th cell: 1 neighbour.
437 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
438 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);
441 BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
// Test-case wrapper; the body (presumably a call to test_cell_count_n() and
// the closing banner/brace) is in the missing lines after original line 446.
444 BOOST_AUTO_TEST_CASE( test_count_nn_cells )
446 std::cout <<
"Test cell count nn" << std::endl;
// Test-case wrapper: runs both sub-index helpers with dim=3, T=float,
// cnt_type=int, ids_type=unsigned char (braces lost to extraction).
451 BOOST_AUTO_TEST_CASE( test_subindex_funcs )
453 std::cout <<
"Test cell list GPU base func" <<
"\n";
455 test_sub_index<3,float,int,unsigned char>();
456 test_sub_index2<3,float,int,unsigned char>();
458 std::cout <<
"End cell list GPU" <<
"\n";
// NOTE(review): extraction-damaged test of the sorted<->unsorted particle
// reorder machinery: fills n_part random particles, tags each property slot
// with a distinctive base value + particle index, builds the bidirectional
// sort maps and the reordered property vector on the GPU, then verifies
// against the reference CPU cell list. Many declarations (domain, cellList,
// vPos, vPrp, gr, starts, cells_out, st, check) and several kernel-launch
// arguments are in the missing lines.
463 template<
unsigned int dim,
typename T,
typename cnt_type,
typename ids_type>
464 void test_reorder_parts(
size_t n_part)
484 size_t div_host[dim];
487 for (
size_t i = 0 ; i < dim ; i++)
493 domain.setLow(i,0.0);
494 domain.setHigh(i,1.0);
500 create_n_part(n_part,vPos,cellList);
502 vPrpReorder.resize(n_part);
503 sortToNonSort.resize(n_part);
504 NonSortToSort.resize(n_part);
506 auto p_it = vPrp.getIterator();
507 while (p_it.isNext())
// Distinctive per-slot markers: base + p makes any mis-reorder detectable.
511 vPrp.template get<0>(p) = 10000 + p;
512 vPrp.template get<1>(p) = 20000 + p;
514 vPrp.template get<2>(p)[0] = 30000 + p;
515 vPrp.template get<2>(p)[1] = 40000 + p;
516 vPrp.template get<2>(p)[2] = 50000 + p;
518 vPrp.template get<3>(p)[0][0] = 60000 + p;
519 vPrp.template get<3>(p)[0][1] = 70000 + p;
520 vPrp.template get<3>(p)[0][2] = 80000 + p;
521 vPrp.template get<3>(p)[1][0] = 90000 + p;
522 vPrp.template get<3>(p)[1][1] = 100000 + p;
523 vPrp.template get<3>(p)[1][2] = 110000 + p;
524 vPrp.template get<3>(p)[2][0] = 120000 + p;
525 vPrp.template get<3>(p)[2][1] = 130000 + p;
// NOTE(review): suspected bug — this writes [0][2] a second time (clobbering
// the 80000+p value above) and [2][2] is never initialised; the pattern of
// the surrounding lines suggests [2][2] was intended. TODO: confirm upstream.
526 vPrp.template get<3>(p)[0][2] = 140000 + p;
533 create_starts_and_parts_ids(cellList,gr,vPos.
size(),tot,starts,cellIndex_LocalIndex,cells_out);
536 auto itgg = vPos.getGPUIterator();
538 cells_out.template hostToDevice<0>();
540 auto ite = vPos.getGPUIterator();
542 vPrp.template hostToDevice<0,1,2,3>();
// Build sorted<->unsorted index maps (some launch arguments missing).
544 CUDA_LAUNCH_DIM3((constructSortUnsortBidirectMap),
546 sortToNonSort.toKernel(),
547 NonSortToSort.toKernel(),
// Reorder property <0> into sorted order (template args partly missing).
552 (reorderParticlesPrp<
553 decltype(vPrp.toKernel()),
554 decltype(NonSortToSort.toKernel()),
558 vPrpReorder.toKernel(),
559 NonSortToSort.toKernel(),
564 vPrpReorder.template deviceToHost<0>();
565 sortToNonSort.template deviceToHost<0>();
566 NonSortToSort.template deviceToHost<0>();
// Verify: walking the CPU cell list in cell order (st = running sorted
// index, declared in a missing line) must match the GPU reorder and both
// direction maps.
569 for (
size_t i = 0 ; i < tot ; i++)
571 size_t n = cellList.getNelements(i);
573 for (
size_t j = 0 ; j < n ; j++)
575 size_t p = cellList.get(i,j);
577 check &= vPrpReorder.template get<0>(st) == vPrp.template get<0>(p);
578 check &= sortToNonSort.template get<0>(st) == p;
579 check &= NonSortToSort.template get<0>(p) == st;
586 BOOST_REQUIRE_EQUAL(check,
true);
// Test-case wrapper: runs the reorder test with 5000 particles in 3D.
589 BOOST_AUTO_TEST_CASE ( test_reorder_particles )
591 std::cout <<
"Test GPU reorder" <<
"\n";
593 test_reorder_parts<3,float,unsigned int, unsigned char>(5000);
595 std::cout <<
"End GPU reorder" <<
"\n";
// NOTE(review): extraction-damaged end-to-end test of CellS (CellList_gpu)
// construct + property reorder + restore. Nine fixed particles (declarations
// missing, lines 599-639) get properties derived from their coordinates;
// after construct(), the reordered properties and both sort maps are checked
// against hard-coded expectations. vPos, vPrp, vPrpReorder, pl_correct are
// declared in missing lines.
598 template<
unsigned int dim,
typename T,
typename CellS>
void Test_cell_gpu(
Box<dim,T> & box)
604 size_t div[dim] = {16,16,16};
610 CellS cellList2(box,div);
// Enable GPU-side property reorder and restore paths for this test.
611 cellList2.setOpt(CL_NON_SYMMETRIC | CL_GPU_REORDER_PROPERTY | CL_GPU_RESTORE_PROPERTY);
640 vPrp.resize(vPos.
size());
642 for (
size_t i = 0 ; i < vPos.
size() ; i++)
// Property values are position-derived with distinct offsets so a wrong
// reorder is detectable per-slot.
644 vPrp.template get<0>(i) = vPos.template get<0>(i)[0];
646 vPrp.template get<1>(i)[0] = vPos.template get<0>(i)[0]+100.0;
647 vPrp.template get<1>(i)[1] = vPos.template get<0>(i)[1]+100.0;
648 vPrp.template get<1>(i)[2] = vPos.template get<0>(i)[2]+100.0;
650 vPrp.template get<2>(i)[0][0] = vPos.template get<0>(i)[0]+1000.0;
651 vPrp.template get<2>(i)[0][1] = vPos.template get<0>(i)[1]+1000.0;
652 vPrp.template get<2>(i)[0][2] = vPos.template get<0>(i)[2]+1000.0;
654 vPrp.template get<2>(i)[1][0] = vPos.template get<0>(i)[0]+2000.0;
655 vPrp.template get<2>(i)[1][1] = vPos.template get<0>(i)[1]+3000.0;
656 vPrp.template get<2>(i)[1][2] = vPos.template get<0>(i)[2]+4000.0;
658 vPrp.template get<2>(i)[2][0] = vPos.template get<0>(i)[0]+5000.0;
659 vPrp.template get<2>(i)[2][1] = vPos.template get<0>(i)[1]+6000.0;
660 vPrp.template get<2>(i)[2][2] = vPos.template get<0>(i)[2]+7000.0;
663 vPrp.resize(vPos.
size());
665 vPos.template hostToDevice<0>();
666 vPrp.template hostToDevice<0,1,2>();
// construct() call arguments truncated (numbering jumps 674 -> 685).
674 cellList2.template construct<decltype(vPos), decltype(vPrp), 0, 1, 2>(
685 vPrpReorder.template deviceToHost<0,1,2>();
// pl_correct (expected sorted positions) is built in the missing lines.
701 for (
size_t i = 0 ; i < pl_correct.
size() ; i++)
703 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<0>(i),(
float)pl_correct.template get<0>(i)[0]);
704 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[0],(
float)(pl_correct.template get<0>(i)[0]+100.0));
705 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[1],(
float)(pl_correct.template get<0>(i)[1]+100.0));
706 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[2],(
float)(pl_correct.template get<0>(i)[2]+100.0));
707 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][0],(
float)(pl_correct.template get<0>(i)[0] + 1000.0));
708 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][1],(
float)(pl_correct.template get<0>(i)[1] + 1000.0));
709 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][2],(
float)(pl_correct.template get<0>(i)[2] + 1000.0));
710 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][0],(
float)(pl_correct.template get<0>(i)[0] + 2000.0));
711 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][1],(
float)(pl_correct.template get<0>(i)[1] + 3000.0));
712 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][2],(
float)(pl_correct.template get<0>(i)[2] + 4000.0));
713 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][0],(
float)(pl_correct.template get<0>(i)[0] + 5000.0));
714 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][1],(
float)(pl_correct.template get<0>(i)[1] + 6000.0));
715 BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][2],(
float)(pl_correct.template get<0>(i)[2] + 7000.0));
// Sorted -> non-sorted permutation must match the hard-coded expectation
// for the 9 fixed particles.
720 auto & vsrt = cellList2.getSortToNonSort();
721 vsrt.template deviceToHost<0>();
723 BOOST_REQUIRE_EQUAL(vsrt.size(),9);
725 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
726 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
727 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
728 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
729 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
730 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
731 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
732 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
733 BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);
// Inverse permutation check (non-sorted -> sorted).
735 auto & vnsrt = cellList2.getNonSortToSort();
737 BOOST_REQUIRE_EQUAL(vnsrt.size(),9);
741 vnsrt.template deviceToHost<0>();
743 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
744 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
745 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
746 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
747 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
748 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
749 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
750 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
751 BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
// Test-case wrapper: dense CellList_gpu over double precision (the `box`
// construction is in the missing lines before original line 762).
755 BOOST_AUTO_TEST_CASE( CellList_gpu_use)
757 std::cout <<
"Test cell list GPU" <<
"\n";
762 Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);
764 std::cout <<
"End cell list GPU" <<
"\n";
// Test-case wrapper: same scenario with the sparse CellList_gpu variant
// (last template parameter `true`).
769 BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
771 std::cout <<
"Test cell list GPU sparse" <<
"\n";
776 Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,
true>> (box);
778 std::cout <<
"End cell list GPU sparse" <<
"\n";
// NOTE(review): fills vd_pos with n random positions inside `box` (0.0001
// margin kept off the upper bound) and gives each particle a property value
// of i % 3. The vd_pos.add()/vd_prp.add() calls and loop braces are in the
// missing lines (numbering gaps 786 -> 790, 792 -> 796).
783 template<
unsigned int dim,
typename vector_ps,
typename vector_pr>
784 void fill_random_parts(
Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp,
size_t n)
786 for (
size_t i = 0 ; i < n ; i++)
// Uniform sample per axis in [low, high - 0.0001).
790 p.get(0) = ((box.
getHigh(0) - box.
getLow(0) - 0.0001)*(
float)rand()/RAND_MAX) + box.
getLow(0);
791 p.get(1) = ((box.
getHigh(1) - box.
getLow(1) - 0.0001)*(
float)rand()/RAND_MAX) + box.
getLow(1);
792 p.get(2) = ((box.
getHigh(2) - box.
getLow(2) - 0.0001)*(
float)rand()/RAND_MAX) + box.
getLow(2);
// Property <0> cycles through 0,1,2.
796 vd_prp.last().template get<0>() = i % 3;
// Kernel: one thread per particle; walks the box NN iterator of the
// particle's cell and atomically bumps a per-neighbour counter, indexed via
// the sorted->non-sorted map. NOTE(review): extraction gaps (806 -> 810,
// 810 -> 814, 815 -> 817) hide the construction of `xp` from pos, the
// while(it.isNext()) loop, it.next(), and braces.
801 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
802 __global__
void calc_force_number(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
804 int p = threadIdx.x + blockIdx.x * blockDim.x;
// Bounds guard: grids rarely divide the particle count evenly.
806 if (p >= pos.size())
return;
810 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
814 auto q = it.get_sort();
815 auto q_ns = it.get();
// Map sorted neighbour index back to the original particle id.
817 int s1 = sortToNonSort.template get<0>(q);
// Atomic: many threads may count the same neighbour concurrently.
819 atomicAdd(&n_out.template get<0>(s1), 1);
// Kernel: non-atomic variant of calc_force_number — each thread increments
// ONLY its own slot n_out[p], so no atomics are needed. Same extraction gaps
// as above (xp construction, NN loop, it.next(), braces are missing).
825 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
826 __global__
void calc_force_number_noato(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
828 int p = threadIdx.x + blockIdx.x * blockDim.x;
830 if (p >= pos.size())
return;
834 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
838 auto q = it.get_sort();
839 auto q_ns = it.get();
841 int s1 = sortToNonSort.template get<0>(q);
// Own-slot increment: race-free without atomicAdd.
843 ++n_out.template get<0>(p);
// Kernel: like calc_force_number but offsets the particle index by `start`,
// so a launch can process a sub-range [start, pos.size()) of the particles.
// Extraction gaps hide xp, the NN while-loop, it.next() and braces.
849 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
850 __global__
void calc_force_number_box(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out,
unsigned int start)
852 int p = threadIdx.x + blockIdx.x * blockDim.x + start;
854 if (p >= pos.size())
return;
858 auto it = cellList.getNNIteratorBox(cellList.getCell(xp))
;
862 auto q = it.get_sort();
864 int s1 = sortToNonSort.template get<0>(q);
866 atomicAdd(&n_out.template get<0>(s1), 1);
// Kernel: sub-range (`start` offset) + non-atomic variant — each thread
// counts its own neighbours into n_out[p]. Extraction gaps hide xp, the NN
// while-loop, it.next() and braces.
872 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
873 __global__
void calc_force_number_box_noato(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out,
unsigned int start)
875 int p = threadIdx.x + blockIdx.x * blockDim.x + start;
877 if (p >= pos.size())
return;
881 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
885 auto q = it.get_sort();
887 ++n_out.template get<0>(p);
// Kernel: radius-based neighbour counting — identical to calc_force_number
// except it uses getNNIteratorRadius instead of the box iterator. Extraction
// gaps hide xp, the NN while-loop, it.next() and braces.
893 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
894 __global__
void calc_force_number_rad(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
896 int p = threadIdx.x + blockIdx.x * blockDim.x;
898 if (p >= pos.size())
return;
902 auto it = cellList.getNNIteratorRadius(cellList.getCell(xp));
906 auto q = it.get_sort();
908 int s1 = sortToNonSort.template get<0>(q);
910 atomicAdd(&n_out.template get<0>(s1), 1);
// Kernel: writes each particle's neighbour ids into a flat list, starting at
// the scanned offset v_nscan[p] (start_list is presumably incremented per
// neighbour in a missing line — TODO confirm). Extraction gaps hide xp, the
// NN while-loop, it.next() and braces.
916 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
917 __global__
void calc_force_list_box(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
919 int p = threadIdx.x + blockIdx.x * blockDim.x;
921 if (p >= pos.size())
return;
// Start offset for this particle's slice of the flat neighbour list.
924 int start_list = v_nscan.template get<0>(p);
926 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
930 auto q = it.get_sort();
932 int s1 = sortToNonSort.template get<0>(q);
934 v_list.template get<0>(start_list) = s1;
// Kernel: same flat neighbour-list fill as calc_force_list_box (box NN
// iterator, scanned start offsets); kept as a separate entry point. Same
// extraction gaps (xp, NN while-loop, it.next(), braces missing).
941 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
942 __global__
void calc_force_list(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
944 int p = threadIdx.x + blockIdx.x * blockDim.x;
946 if (p >= pos.size())
return;
949 int start_list = v_nscan.template get<0>(p);
951 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
955 auto q = it.get_sort();
957 int s1 = sortToNonSort.template get<0>(q);
959 v_list.template get<0>(start_list) = s1;
// Kernel: neighbour-list fill for the SPLIT cell-list test — the write
// offset combines the global scan (v_nscan) with a partial-run scan
// (v_nscan_part) so two construct() passes can append into one list.
// Extraction gaps hide xp, the NN while-loop, it.next() and braces.
966 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
967 __global__
void calc_force_list_box_partial(vector_pos pos,
968 vector_ns sortToNonSort,
969 CellList_type cellList,
970 vector_n_type v_nscan,
971 vector_n_type v_nscan_part,
972 vector_n_type v_list)
974 int p = threadIdx.x + blockIdx.x * blockDim.x;
976 if (p >= pos.size())
return;
// Combined offset: global prefix + offset contributed by the earlier pass.
979 int start_list = v_nscan.template get<0>(p) + v_nscan_part.template get<0>(p);
981 auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
985 auto q = it.get_sort();
987 int s1 = sortToNonSort.template get<0>(q);
989 v_list.template get<0>(start_list) = s1;
// Kernel: radius-iterator counterpart of calc_force_list (uses
// getNNIteratorRadius). Extraction gaps hide xp, the NN while-loop,
// it.next() and braces.
996 template<
typename vector_pos,
typename vector_ns,
typename CellList_type,
typename vector_n_type>
997 __global__
void calc_force_list_rad(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
999 int p = threadIdx.x + blockIdx.x * blockDim.x;
1001 if (p >= pos.size())
return;
1004 int start_list = v_nscan.template get<0>(p);
1006 auto it = cellList.getNNIteratorRadius(cellList.getCell(xp));
1010 auto q = it.get_sort();
1012 int s1 = sortToNonSort.template get<0>(q);
1014 v_list.template get<0>(start_list) = s1;
// NOTE(review): interior of a dispatch struct (its declaration — original
// lines 1022-1023 — is missing, so the struct name and whether this is the
// primary template are unverifiable). This variant drives the box-iterator
// kernels: calc_force_number / calc_force_list, and getNN returns
// getNNIteratorBox. set_radius is a no-op here (empty body elided or
// genuinely empty — cannot tell). Several CUDA_LAUNCH trailing arguments are
// in missing lines.
1021 template<
unsigned int impl>
1024 template<
typename CellS,
typename Cells_cpu_type,
typename T>
1025 static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1029 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_type>
1030 static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out,
unsigned int start)
1032 auto ite = vPos.getGPUIterator();
1034 CUDA_LAUNCH((calc_force_number),ite,vPos.toKernel(),
1035 sortToNonSort.toKernel(),
1036 cellList2.toKernel(),
1041 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_scan_type,
typename nn_list_type>
1042 static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2,n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1044 auto ite = vPos.getGPUIterator();
1046 CUDA_LAUNCH((calc_force_list),ite,vPos.toKernel(),
1047 sortToNonSort.toKernel(),
1048 cellList2.toKernel(),
1049 n_out_scan.toKernel(),
1054 template<
typename NN_type>
1055 static auto getNN(NN_type & nn,
size_t cell) -> decltype(nn.getNNIteratorBox(cell))
1057 return nn.getNNIteratorBox(cell);
// NOTE(review): interior of the radius-variant dispatch specialization (its
// struct header is in missing lines). set_radius configures both GPU and CPU
// cell lists; calc_num/calc_list launch the *_rad kernels; getNN returns the
// radius iterator. Trailing launch arguments are in missing lines.
1064 template<
typename CellS,
typename Cells_cpu_type,
typename T>
1065 static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1067 cellList2.setRadius(radius);
1068 cl_cpu.setRadius(radius);
1071 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_type>
1072 static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out,
unsigned int start)
1074 auto ite = vPos.getGPUIterator();
1076 CUDA_LAUNCH((calc_force_number_rad<decltype(vPos.toKernel()),
1077 decltype(sortToNonSort.toKernel()),
1078 decltype(cellList2.toKernel()),
1079 decltype(n_out.toKernel())>),
1080 ite,vPos.toKernel(),
1081 sortToNonSort.toKernel(),
1082 cellList2.toKernel(),
1086 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_scan_type,
typename nn_list_type>
1087 static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1089 auto ite = vPos.getGPUIterator();
1091 CUDA_LAUNCH((calc_force_list_rad<decltype(vPos.toKernel()),
1092 decltype(sortToNonSort.toKernel()),
1093 decltype(cellList2.toKernel()),
1094 decltype(nn_list.toKernel())>),
1095 ite,vPos.toKernel(),
1096 sortToNonSort.toKernel(),
1097 cellList2.toKernel(),
1098 n_out_scan.toKernel(),
1099 nn_list.toKernel());
1102 template<
typename NN_type>
1103 static auto getNN(NN_type & nn,
size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1105 return nn.getNNIteratorRadius(cell);
// NOTE(review): interior of the box/sub-range dispatch specialization (struct
// header missing). Iterates over sortToNonSort instead of vPos, launches the
// *_box kernels with a `start` offset, and adds a calc_list_partial path for
// the split cell-list test. NOTE(review): getNN here returns
// getNNIteratorRadius, which looks inconsistent with the box kernels this
// variant launches — TODO confirm against the full source.
1112 template<
typename CellS,
typename Cells_cpu_type,
typename T>
1113 static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1115 cellList2.setRadius(radius);
1116 cl_cpu.setRadius(radius);
1119 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_type>
1120 static void calc_num_noato(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out,
unsigned int start)
1122 auto ite = sortToNonSort.getGPUIterator();
1124 CUDA_LAUNCH((calc_force_number_box_noato<decltype(vPos.toKernel()),
1125 decltype(sortToNonSort.toKernel()),
1126 decltype(cellList2.toKernel()),
1127 decltype(n_out.toKernel())>),
1128 ite,vPos.toKernel(),
1129 sortToNonSort.toKernel(),
1130 cellList2.toKernel(),
1135 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_type>
1136 static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out,
unsigned int start)
1138 auto ite = sortToNonSort.getGPUIterator();
1140 CUDA_LAUNCH((calc_force_number_box<decltype(vPos.toKernel()),
1141 decltype(sortToNonSort.toKernel()),
1142 decltype(cellList2.toKernel()),
1143 decltype(n_out.toKernel())>),
1146 sortToNonSort.toKernel(),
1147 cellList2.toKernel(),
1152 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_scan_type,
typename nn_list_type>
1153 static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1155 auto ite = sortToNonSort.getGPUIterator();
1157 CUDA_LAUNCH((calc_force_list_box<decltype(vPos.toKernel()),
1158 decltype(sortToNonSort.toKernel()),
1159 decltype(cellList2.toKernel()),
1160 decltype(nn_list.toKernel())>),
1161 ite,vPos.toKernel(),
1162 sortToNonSort.toKernel(),
1163 cellList2.toKernel(),
1164 n_out_scan.toKernel(),
1165 nn_list.toKernel());
1168 template<
typename pl_type,
typename sortToNonSort_type,
typename cl2_type,
typename n_out_scan_type,
typename nn_list_type>
1169 static void calc_list_partial(pl_type & vPos,
1170 sortToNonSort_type & sortToNonSort,
1171 cl2_type & cellList2,
1172 n_out_scan_type & n_out_scan,
1173 n_out_scan_type & n_out_scan_partial,
1174 nn_list_type & nn_list)
1176 auto ite = sortToNonSort.getGPUIterator();
1178 CUDA_LAUNCH((calc_force_list_box_partial),ite,vPos.toKernel(),
1179 sortToNonSort.toKernel(),
1180 cellList2.toKernel(),
1181 n_out_scan.toKernel(),
1182 n_out_scan_partial.toKernel(),
1183 nn_list.toKernel());
1186 template<
typename NN_type>
1187 static auto getNN(NN_type & nn,
size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1189 return nn.getNNIteratorRadius(cell);
// NOTE(review): extraction-damaged force-calculation test: random particles,
// GPU cell-list construct with a ghost marker at the halfway point, then (1)
// neighbour COUNTS checked against a CPU NN walk, and (2) the flat neighbour
// LIST (scan + fill) checked element-wise against a CPU-built list. Many
// declarations (vPos, vPrp, n_out, cl_cpu, NN_it, n_ele, check, cpu_list,
// gpu_list, gpuContext) and kernel dispatch calls are in missing lines.
1193 template<
unsigned int dim,
typename T,
typename CellS,
int impl>
1194 void Test_cell_gpu_force(
Box<dim,T> & box,
size_t npart,
const size_t (& div)[dim],
int box_nn = 2)
1200 CellS cellList2(box,div,2);
1204 cellList2.setBoxNN(box_nn);
1218 fill_random_parts<3>(box,vPos,vPrp,npart);
// +1 slot so the exclusive scan's total lands at index size().
1220 n_out.resize(vPos.
size()+1);
1223 vPrp.resize(vPos.
size());
1225 vPos.template hostToDevice<0>();
1226 vPrp.template hostToDevice<0,1>();
1232 auto it2 = vPos.getIterator();
1234 while (it2.isNext())
// Half the particles are treated as ghosts.
1245 size_t ghostMarker = vPos.
size() / 2;
1248 cellList2.construct(vPos, vPrp, gpuContext, ghostMarker, 0, vPos.
size());
1250 auto & sortToNonSort = cellList2.getSortToNonSort();
1252 vPos.template hostToDevice<0>();
// Every domain-sorted id must map back below the ghost marker.
1258 auto & gdsi = cellList2.getDomainSortIds();
1259 gdsi.template deviceToHost<0>();
1260 sortToNonSort.template deviceToHost<0>();
1263 for (
size_t i = 0 ; i < ghostMarker ; i++)
1265 unsigned int p = gdsi.template get<0>(i);
1267 match &= (sortToNonSort.template get<0>(p) < ghostMarker);
1270 BOOST_REQUIRE_EQUAL(match,
true);
1274 n_out.deviceToHost<0>();
// Phase 1: compare GPU neighbour counts with a CPU NN iteration.
1278 auto it = vPos.getIterator();
1291 while (NN_it.isNext())
1293 auto q = NN_it.get();
1300 check &= n_ele == n_out.template get<0>(p);
// Diagnostic print kept for failure analysis (guard condition missing).
1304 std::cout << p <<
" " << n_ele <<
" " << n_out.template get<0>(p) <<
" " << check << std::endl;
1310 BOOST_REQUIRE_EQUAL(check,
true);
// Phase 2: scan counts into offsets, fill the flat neighbour list.
1318 n_out_scan.resize(vPos.
size()+1);
1320 openfpm::scan((
unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.
size(),(
unsigned int *)n_out_scan.template getDeviceBuffer<0>(),gpuContext);
1321 n_out_scan.template deviceToHost<0>();
// Empty total => nothing to verify (early-return body in missing lines).
1323 if (n_out_scan.template get<0>(vPos.
size()) == 0)
1326 nn_list.resize(n_out_scan.template get<0>(vPos.
size()));
1330 vPos.template hostToDevice<0>();
1334 nn_list.template deviceToHost<0>();
1338 n_out.deviceToHost<0>();
// Compare per-particle neighbour id sets (both presumably sorted in the
// missing lines before the element-wise comparison — TODO confirm).
1342 auto it = vPos.getIterator();
1356 while (NN_it.isNext())
1358 auto q = NN_it.get();
1367 for (
size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1369 gpu_list.add(nn_list.template get<0>(i));
1377 for (
size_t j = 0 ; j < cpu_list.
size() ; j++)
1378 {check &= cpu_list.get(j) == gpu_list.get(j);}
1383 BOOST_REQUIRE_EQUAL(check,
true);
// NOTE(review): split variant of Test_cell_gpu_force — two cell lists are
// constructed over the two halves of the particle range ([0, n/2) and
// [n/2, n)), and only the first half's neighbour counts/lists are verified
// (note getIteratorTo(size()/2-1)). Uses n_out_partial plus the
// calc_list_partial path so both passes write into one flat list. Same
// extraction damage as the non-split test (declarations and dispatch calls
// in missing lines).
1388 template<
unsigned int dim,
typename T,
typename CellS,
int impl>
1389 void Test_cell_gpu_force_split(
Box<dim,T> & box,
size_t npart,
const size_t (& div)[dim],
int box_nn = 2)
1395 CellS cl2_split1(box,div,2);
1396 CellS cl2_split2(box,div,2);
1400 cl2_split1.setBoxNN(box_nn);
1401 cl2_split2.setBoxNN(box_nn);
1417 fill_random_parts<3>(box,vPos,vPrp,npart);
1419 n_out.resize(vPos.
size()+1);
1422 vPrp.resize(vPos.
size());
1424 vPos.template hostToDevice<0>();
1425 vPrp.template hostToDevice<0,1>();
1431 auto it2 = vPos.getIterator();
1433 while (it2.isNext())
1444 size_t ghostMarker = vPos.
size() / 2;
// Two constructs over disjoint halves of the particle range.
1447 cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.
size()/2);
1448 cl2_split2.construct(vPos,vPrp,gpuContext,ghostMarker,vPos.
size()/2,vPos.
size());
1449 auto & sortToNonSort_s1 = cl2_split1.getSortToNonSort();
1450 auto & sortToNonSort_s2 = cl2_split2.getSortToNonSort();
// Snapshot of first-pass counts, used as the partial offset later.
1453 n_out_partial = n_out;
1458 auto & gdsi_s1 = cl2_split1.getDomainSortIds();
1459 gdsi_s1.template deviceToHost<0>();
1460 sortToNonSort_s1.template deviceToHost<0>();
1463 for (
size_t i = 0 ; i < ghostMarker ; i++)
1465 unsigned int p = gdsi_s1.template get<0>(i);
1467 match &= (sortToNonSort_s1.template get<0>(p) < ghostMarker);
1470 BOOST_REQUIRE_EQUAL(match,
true);
1474 n_out.deviceToHost<0>();
// Verify counts for the first half only.
1478 auto it = vPos.getIteratorTo(vPos.
size()/2-1);
1491 while (NN_it.isNext())
1493 auto q = NN_it.get();
1500 check &= n_ele == n_out.template get<0>(p);
1504 std::cout << p <<
" " << n_ele <<
" " << n_out.template get<0>(p) <<
" " << check << std::endl;
1510 BOOST_REQUIRE_EQUAL(check,
true);
1518 n_out_scan.resize(n_out.
size());
1520 openfpm::scan((
unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.
size(),(
unsigned int *)n_out_scan.template getDeviceBuffer<0>(),gpuContext);
1522 n_out_scan.template deviceToHost<0>();
1524 if (n_out_scan.template get<0>(vPos.
size()) == 0)
1527 nn_list.resize(n_out_scan.template get<0>(vPos.
size()));
1531 vPos.template hostToDevice<0>();
1536 nn_list.template deviceToHost<0>();
1540 n_out.deviceToHost<0>();
// Element-wise list comparison, first half only.
1544 auto it = vPos.getIteratorTo(vPos.
size()/2-1);
1558 while (NN_it.isNext())
1560 auto q = NN_it.get();
1569 for (
size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1571 gpu_list.add(nn_list.template get<0>(i));
1579 for (
size_t j = 0 ; j < cpu_list.
size() ; j++)
1580 {check &= cpu_list.get(j) == gpu_list.get(j);}
// Failure dump (guard condition is in the missing lines).
1584 std::cout <<
"NPARTS: " << npart << std::endl;
1586 for (
size_t j = 0 ; j < cpu_list.
size() ; j++)
1587 {std::cout << cpu_list.get(j) <<
" " << gpu_list.get(j) << std::endl;}
1595 BOOST_REQUIRE_EQUAL(check,
true);
// Test case: dense GPU cell list, impl=2 (calc-force box variant), 8^3 cells,
// run on two domains and two particle counts.
// NOTE(review): doxygen-listing extraction — the declaration of `box` used
// below is on a line missing from this view (only box2 is visible).
1600 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box)
1602 std::cout <<
"Test cell list GPU" <<
"\n";
// box2 has a negative lower corner to exercise a shifted domain origin.
1605 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1607 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{8,8,8});
1608 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{8,8,8});
1610 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{8,8,8});
1611 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{8,8,8});
1613 std::cout <<
"End cell list GPU" <<
"\n";
// Test case: split construction (two half-range cell lists) with impl=2 and
// 32^3 cells; exercises Test_cell_gpu_force_split on two domains.
// NOTE(review): `box` is declared on a line missing from this extraction.
1618 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split)
1620 std::cout <<
"Test cell list GPU split" <<
"\n";
1623 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1625 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
1626 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});
1628 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
1629 Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});
1631 std::cout <<
"End cell list GPU split" <<
"\n";
// Test case: SPARSE GPU cell list (last template arg true), impl=2, 32^3
// cells, extra trailing argument 2 (box_nn — TODO confirm against
// Test_cell_gpu_force's signature, which is outside this view).
// NOTE(review): `box` is declared on a line missing from this extraction.
1723 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse)
1725 std::cout <<
"Test cell list GPU" <<
"\n";
1728 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1730 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,2>(box,1000,{32,32,32},2);
1731 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,2>(box,10000,{32,32,32},2);
1733 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,2>(box2,1000,{32,32,32},2);
1734 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,2>(box2,10000,{32,32,32},2);
1736 std::cout <<
"End cell list GPU" <<
"\n";
// Test case: dense GPU cell list with impl=1 (radius-based neighbor variant),
// 32^3 cells, two domains, two particle counts.
// NOTE(review): `box` is declared on a line missing from this extraction.
1741 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius)
1743 std::cout <<
"Test cell list GPU" <<
"\n";
1746 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1748 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,1000,{32,32,32});
1749 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,10000,{32,32,32});
1751 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,1000,{32,32,32});
1752 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,10000,{32,32,32});
1754 std::cout <<
"End cell list GPU" <<
"\n";
// Test case: dense GPU cell list with impl=0 (default neighbor iteration),
// 16^3 cells, two domains, two particle counts.
// NOTE(review): `box` is declared on a line missing from this extraction.
1761 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force)
1763 std::cout <<
"Test cell list GPU" <<
"\n";
1766 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1768 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,1000,{16,16,16});
1769 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,10000,{16,16,16});
1771 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,1000,{16,16,16});
1772 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,10000,{16,16,16});
1774 std::cout <<
"End cell list GPU" <<
"\n";
// Test case: SPARSE GPU cell list (last template arg true) with impl=0,
// 16^3 cells, two domains, two particle counts.
// NOTE(review): `box` is declared on a line missing from this extraction.
1779 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse)
1781 std::cout <<
"Test cell list GPU force sparse" <<
"\n";
1784 Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1786 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,0>(box,1000,{16,16,16});
1787 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,0>(box,10000,{16,16,16});
1789 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,0>(box2,1000,{16,16,16});
1790 Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,
true>,0>(box2,10000,{16,16,16});
1792 std::cout <<
"End cell list GPU force sparse" <<
"\n";
// Kernel: for each particle p, store into output<0>(p) the number of elements
// in the cell containing that particle. One thread per particle, 1D launch.
// NOTE(review): doxygen-listing extraction — the line loading `xp`
// (presumably the particle position read from `parts`) is missing from this
// view; TODO confirm against the full source.
1799 template<
typename CellList_type,
typename Vector_type,
typename Vector_out>
1800 __global__
void cl_offload_gpu(CellList_type cellList, Vector_type parts, Vector_out output)
// Flat global thread index.
1802 int p = threadIdx.x + blockIdx.x * blockDim.x;
// Bounds guard for the grid tail.
1804 if (p >= parts.size())
return;
1808 output.template get<0>(p) = cellList.getNelements(cellList.getCell(xp));
// Kernel: for each particle p, copy the ids of all elements in its cell into
// the flat `list` at the offset given by scan<0>(p). One thread per particle.
// NOTE(review): doxygen-listing extraction — the line loading `xp` (the
// particle position) is missing from this view; TODO confirm.
1811 template<
typename CellList_type,
typename Vector_type,
typename Vector_scan_type,
typename Vector_list_type>
1812 __global__
void cl_offload_gpu_list(CellList_type cellList, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
// Flat global thread index.
1814 int p = threadIdx.x + blockIdx.x * blockDim.x;
// Bounds guard for the grid tail.
1816 if (p >= parts.size())
return;
1820 int id = cellList.getCell(xp);
1821 int n_ele = cellList.getNelements(
id);
// start = exclusive-scan offset of this particle's slice in `list`.
1822 int start = scan.template get<0>(p);
1824 for (
int j = 0 ; j < n_ele ; j++)
1826 list.template get<0>(start+j) = cellList.get(
id,j);
// Test case: build a CPU cell list (cl1), offload it to the GPU via toKernel(),
// and verify on the host that (a) per-particle cell element counts and (b) the
// flattened per-cell element lists computed on the GPU match the CPU queries.
// NOTE(review): doxygen-listing extraction — declarations of v, os, os_scan,
// os_list, cl1, gpuContext, match, xp, and most braces are on lines missing
// from this view. Code left byte-identical; comments only.
1833 BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
1835 std::cout <<
"Test cell list offload gpu" <<
"\n";
1838 size_t div[3] = {10,10,10};
1851 os.resize(v.size());
// Fill positions with uniform random coordinates in [-1,1).
1853 for (
size_t i = 0 ; i < v.size() ; i++)
1855 v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1856 v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1857 v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1864 auto ite = v.getGPUIterator();
1867 v.hostToDevice<0>();
// Launch: one thread per particle; writes per-particle cell counts into os.
1869 CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());
1871 os.deviceToHost<0>();
// Host-side check of the GPU counts against the CPU cell list.
1874 for (
size_t i = 0 ; i < os.
size() ; i++)
1878 match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
1881 BOOST_REQUIRE_EQUAL(match,
true);
// Exclusive scan of counts -> per-particle offsets into the flat list.
1886 os_scan.resize(v.size());
1889 openfpm::scan((
int *)os.template getDeviceBuffer<0>(),os.
size(),(
int *)os_scan.template getDeviceBuffer<0>(),gpuContext);
1891 os_scan.deviceToHost<0>();
// Only the last count is needed on the host to compute the total list size.
1892 os.deviceToHost<0>(os.
size()-1,os.
size()-1);
// total = last exclusive-scan value + last count.
1893 size_t size_list = os_scan.template get<0>(os_scan.
size()-1) + os.template get<0>(os.
size()-1);
1896 os_list.resize(size_list);
// Second launch: each thread copies its cell's element ids into os_list.
1898 CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
1899 decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
1900 cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
1902 os_list.deviceToHost<0>();
// Host-side check of the flattened GPU lists against CPU cell queries.
1905 for (
size_t i = 0 ; i < os.
size() ; i++)
1909 for (
size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
1911 match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
1915 BOOST_REQUIRE_EQUAL(match,
true);
1917 std::cout <<
"End cell list offload gpu" <<
"\n";
// Test case: verify CellList_gpu::swap(). cellList2 and cellList4 are built
// identically; cellList3 then swaps with cellList2 and must compare equal to
// cellList4 cell-by-cell and element-by-element.
// NOTE(review): doxygen-listing extraction — declarations of box, vPos, vPrp,
// gpuContext, check, s1, s2, and most braces are on lines missing from this
// view. Code left byte-identical; comments only.
1924 BOOST_AUTO_TEST_CASE( CellList_swap_test )
1926 size_t npart = 4096;
1931 size_t div[3] = {10,10,10};
1937 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList2(box,div,2);
1938 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList3(box,div,2);
1939 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList4(box,div,2);
1948 fill_random_parts<3>(box,vPos,vPrp,npart);
1950 vPrp.resize(vPos.
size());
1952 vPos.template hostToDevice<0>();
1953 vPrp.template hostToDevice<0,1>();
// First half domain, second half ghost.
1955 size_t ghostMarker = vPos.
size() / 2;
// Build the same structure twice; cellList3 stays empty until the swap.
1958 cellList2.construct(vPos,vPrp,gpuContext,ghostMarker);
1959 cellList4.construct(vPos,vPrp,gpuContext,ghostMarker);
1961 cellList3.swap(cellList2);
1965 cellList3.debug_deviceToHost();
1966 cellList4.debug_deviceToHost();
1968 BOOST_REQUIRE_EQUAL(cellList3.getNCells(),cellList4.getNCells());
// Compare per-cell element counts and collect element ids from both lists.
1974 for (
size_t i = 0 ; i < cellList3.getNCells() ; i++)
1976 check &= cellList3.getNelements(i) == cellList4.getNelements(i);
1978 for (
size_t j = 0 ; j < cellList3.getNelements(i) ; j++)
1980 s1.add(cellList3.get(i,j));
1981 s2.add(cellList4.get(i,j));
// Element-wise comparison of the collected ids. NOTE(review): a sort of
// s1/s2 presumably happens on the missing lines 1982-1986 — TODO confirm.
1987 for (
size_t j = 0 ; j < s1.
size() ; j++)
1989 check &= s1.get(j) == s2.get(j);
1993 BOOST_REQUIRE_EQUAL(check,
true);
1998 BOOST_AUTO_TEST_SUITE_END()
This class represents an N-dimensional box.
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
__device__ __host__ T getHigh(int i) const
get the high interval of the box
Class for FAST cell list implementation.
This class implement an NxN (dense) matrix.
It is a class that works like a vector of vectors.
This class implement the point shape in an N-dimensional space.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Declaration grid_key_dx_iterator_sub.
grid_key_dx is the key to access any element in the grid
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const signed char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
vector_sparse_gpu_ker< T, Ti, layout_base > toKernel()
toKernel function transform this structure into one that can be used on GPU
void flush(gpu::ofp_context_t &gpuContext, flush_type opt=FLUSH_ON_HOST)
merge the added element to the main data array
void setGPUInsertBuffer(int nblock, int nslot)
set the gpu insert buffer for every block
Implementation of 1-D std::vector like structure.
aggregate of properties; from a list of objects it creates a struct that follows the OpenFPM native structure.
Transform the boost::fusion::vector into memory specification (memory_traits)