#define BOOST_GPU_ENABLED __host__ __device__
#include "util/cuda_launch.hpp"

#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>

#include "util/cuda_util.hpp"
#include "cuda/CellList_gpu.hpp"
#include "CellList.hpp"
#include "util/boost/boost_array_openfpm.hpp"
#include "Point_test.hpp"

BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_sub_index()
{
    for (size_t i = 0 ; i < dim ; i++)

    cl_n.resize(17*17*17);
    cl_n.template fill<0>(0);

    part_ids.resize(pl.size());

    size_t sz[3] = {17,17,17};

    auto ite = pl.getGPUIterator();

    pl.template hostToDevice<0>();
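
    // The subindex kernel computes, for every particle in pl, the linearized
    // index of its containing cell (written to part_ids) and accumulates a
    // per-cell particle count in cl_n. Here it runs with no_transform_only,
    // i.e. positions are binned without any shift or transform.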
    CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,
                     div,

                     static_cast<T *>(pl.template getDeviceBuffer<0>()),
                     static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
                     static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
    cl_n.template deviceToHost<0>();
    part_ids.template deviceToHost<0>();
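
    // Ten particles were placed at known positions; each must report the
    // linearized id of its cell, and two of them share cell {2,2,2}.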
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));

    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
}
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_sub_index2()
{
    for (size_t i = 0 ; i < dim ; i++)

    cl_n.resize(17*17*17);
    cl_n.template fill<0>(0);

    part_ids.resize(pl.size());

    size_t sz[3] = {17,17,17};

    auto ite = pl.getGPUIterator();

    pl.template hostToDevice<0>();
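
    // Same setup as test_sub_index(), but subindex runs with the shift_only
    // transform, which applies the cell-list shift to each position before
    // binning; the expected cell ids below are unchanged.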
    CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,
                     div,

                     static_cast<T *>(pl.template getDeviceBuffer<0>()),
                     static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
                     static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
    cl_n.template deviceToHost<0>();
    part_ids.template deviceToHost<0>();

    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));

    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
}
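
// Fill pl with n_part random positions in [0,1)^3 and insert each particle
// into the host cell list cl, which later serves as the CPU reference.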
template<unsigned int dim, typename T>
void create_n_part(int n_part,

    auto it = pl.getIterator();

    pl.template get<0>(p)[0] = (double)rand()/RAND_MAX;
    pl.template get<0>(p)[1] = (double)rand()/RAND_MAX;
    pl.template get<0>(p)[2] = (double)rand()/RAND_MAX;

    xp.get(0) = pl.template get<0>(p)[0];
    xp.get(1) = pl.template get<0>(p)[1];
    xp.get(2) = pl.template get<0>(p)[2];

    size_t c = cl.getCell(xp);
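
// Build, on the host, the reference output of the GPU construction:
// 'starts' holds the per-cell start offset, 'part_ids' the (cell id, slot)
// pair of every particle, and 'cells' the particle ids laid out cell by cell.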
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>

    part_ids.resize(n_part);
    starts.resize(n_cell);
    cells.resize(n_part);

    auto cell = itg.get();

    size_t clin = gr.LinId(cell);

    for (size_t j = 0 ; j < cl.getNelements(clin) ; j++)
    {
        size_t p_id = cl.get(clin,j);

        part_ids.template get<0>(p_id)[0] = clin;
        part_ids.template get<0>(p_id)[1] = j;

        cells.template get<0>(start+j) = p_id;
    }

    starts.template get<0>(clin) = start;
    start += cl.getNelements(clin);
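
// End-to-end check of the GPU fill step: build a host reference with
// create_n_part/create_starts_and_parts_ids, run the device fill, and
// verify that both agree cell by cell.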
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_fill_cell()
{
#ifndef MAKE_CELLLIST_DETERMINISTIC

    size_t div_host[dim];

    for (size_t i = 0 ; i < dim ; i++)

    create_n_part(5000,pl,cl);

    create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);

    cells.resize(pl.size());

    for (size_t i = 0 ; i < gr.size() - 1 ; i++)
    {
        size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);

        check &= (tot_p == cl.getNelements(i));

        for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
        {
            size_t p_id = cl.get(i,j);

            check &= part_ids.template get<0>(p_id)[0] == i;
        }
    }

    BOOST_REQUIRE(check == true);

    auto itgg = part_ids.getGPUIterator();

    starts.template hostToDevice<0>();
    part_ids.template hostToDevice<0>();
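
    // Run the GPU fill: each particle id is scattered on the device to
    // cells[starts[cell] + slot], mirroring the host reference built above.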
                static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
                static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
                static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));

    cells.template deviceToHost<0>();

    for (size_t i = 0 ; i < gr.size() - 1 ; i++)
    {
        size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);

        check &= (tot_p == cl.getNelements(i));

        for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
        {
            size_t p_id = cl.get(i,j);

            size_t p_id2 = cells.template get<0>(starts.template get<0>(i) + j);

            check &= (p_id == p_id2);
        }
    }

    BOOST_REQUIRE(check == true);

#endif
}
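
// Device kernel: insert ten known cell ids into the sparse vector through the
// block insert buffer; used below to test neighborhood counting on a sparse
// cell list.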
template<typename sparse_vector_type>
__global__ void construct_cells(sparse_vector_type sv, grid_sm<3,void> gs)
{

    sv.template insert<0>(gs.LinId(key1)) = gs.LinId(key1);
    sv.template insert<0>(gs.LinId(key2)) = gs.LinId(key2);
    sv.template insert<0>(gs.LinId(key3)) = gs.LinId(key3);
    sv.template insert<0>(gs.LinId(key4)) = gs.LinId(key4);
    sv.template insert<0>(gs.LinId(key5)) = gs.LinId(key5);
    sv.template insert<0>(gs.LinId(key6)) = gs.LinId(key6);
    sv.template insert<0>(gs.LinId(key7)) = gs.LinId(key7);
    sv.template insert<0>(gs.LinId(key8)) = gs.LinId(key8);
    sv.template insert<0>(gs.LinId(key9)) = gs.LinId(key9);
    sv.template insert<0>(gs.LinId(key10)) = gs.LinId(key10);

    sv.flush_block_insert();
}
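
// Count, for every inserted cell, how many of the ten cells fall inside its
// neighborhood, then expand the result into an explicit neighbor list and
// check both against precomputed values.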
void test_cell_count_n()
{
    vs.template setBackground<0>(-1);

    size_t sz[] = {17,17,17};

    CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);

    mgpu::ofp_context_t ctx;

    int mid = gs.LinId(middle);

    cells_nn_test.get<0>(cells_nn_test.size()-1) = (int)gs.LinId(p) - mid;

    cells_nn_test.template hostToDevice<0>();

    auto itgg = vs.getGPUIterator();
    CUDA_LAUNCH((count_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());

    cells_nn.deviceToHost<0>();
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
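
    // The scan turns the per-cell neighbor counts into start offsets for the
    // flattened neighbor list filled next.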
    openfpm::scan((unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (unsigned int *)cells_nn.template getDeviceBuffer<0>(), ctx);

    cell_nn_list.resize(7*8 + 9 + 2 + 1);

    CUDA_LAUNCH((fill_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);

    cell_nn_list.deviceToHost<0>();
    for (size_t i = 0 ; i < 7 ; i++)
    {
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
    }

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
}
BOOST_AUTO_TEST_CASE( test_count_nn_cells )
{
    std::cout << "Test cell count nn" << std::endl;

    test_cell_count_n();
}
BOOST_AUTO_TEST_CASE( test_subindex_funcs )
{
    std::cout << "Test cell list GPU base func" << "\n";

    test_sub_index<3,float,int,unsigned char>();
    test_sub_index2<3,float,int,unsigned char>();

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE ( test_cell_fill )
{
    std::cout << "Test GPU fill cells" << "\n";

    test_fill_cell<3,float,unsigned int, unsigned char>();

    std::cout << "End GPU fill cells" << "\n";
}
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_reorder_parts(size_t n_part)
{
    size_t div_host[dim];

    for (size_t i = 0 ; i < dim ; i++)

    create_n_part(n_part,pl,cl);
    parts_prp.resize(n_part);
    parts_prp_out.resize(n_part);
    pl_out.resize(n_part);
    sort_to_not_sort.resize(n_part);
    non_sort_to_sort.resize(n_part);
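
    // Encode a position-dependent marker into every property component
    // (scalar, vector, tensor) so that any mistake in the reordering shows up
    // as a mismatched value.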
    auto p_it = parts_prp.getIterator();
    while (p_it.isNext())
    {
        auto p = p_it.get();

        parts_prp.template get<0>(p) = 10000 + p;
        parts_prp.template get<1>(p) = 20000 + p;

        parts_prp.template get<2>(p)[0] = 30000 + p;
        parts_prp.template get<2>(p)[1] = 40000 + p;
        parts_prp.template get<2>(p)[2] = 50000 + p;

        parts_prp.template get<3>(p)[0][0] = 60000 + p;
        parts_prp.template get<3>(p)[0][1] = 70000 + p;
        parts_prp.template get<3>(p)[0][2] = 80000 + p;
        parts_prp.template get<3>(p)[1][0] = 90000 + p;
        parts_prp.template get<3>(p)[1][1] = 100000 + p;
        parts_prp.template get<3>(p)[1][2] = 110000 + p;
        parts_prp.template get<3>(p)[2][0] = 120000 + p;
        parts_prp.template get<3>(p)[2][1] = 130000 + p;
        parts_prp.template get<3>(p)[2][2] = 140000 + p;   // completes the 3x3 marker tensor

        ++p_it;
    }
    create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);

    auto itgg = pl.getGPUIterator();

    cells_out.template hostToDevice<0>();

    auto ite = pl.getGPUIterator();

    parts_prp.template hostToDevice<0,1,2,3>();

    CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
                                    decltype(pl.toKernel()),
                                    decltype(sort_to_not_sort.toKernel()),

                     parts_prp.toKernel(),
                     parts_prp_out.toKernel(),

                     sort_to_not_sort.toKernel(),
                     non_sort_to_sort.toKernel(),
                     static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
    parts_prp_out.template deviceToHost<0>();
    sort_to_not_sort.template deviceToHost<0>();
    non_sort_to_sort.template deviceToHost<0>();
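
    // Walk the host cell list in cell order (= sorted order) and verify the
    // reordered properties plus both permutation maps.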
    for (size_t i = 0 ; i < tot ; i++)
    {
        size_t n = cl.getNelements(i);

        for (size_t j = 0 ; j < n ; j++)
        {
            size_t p = cl.get(i,j);

            check &= parts_prp_out.template get<0>(st) == parts_prp.template get<0>(p);
            check &= sort_to_not_sort.template get<0>(st) == p;
            check &= non_sort_to_sort.template get<0>(p) == st;

            st++;
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
}
BOOST_AUTO_TEST_CASE ( test_reorder_particles )
{
    std::cout << "Test GPU reorder" << "\n";

    test_reorder_parts<3,float,unsigned int, unsigned char>(5000);

    std::cout << "End GPU reorder" << "\n";
}
template<unsigned int dim, typename T, typename CellS>
void Test_cell_gpu(SpaceBox<dim,T> & box)
{
    size_t div[dim] = {16,16,16};

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());
    for (size_t i = 0 ; i < pl.size() ; i++)
    {
        pl_prp.template get<0>(i) = pl.template get<0>(i)[0];

        pl_prp.template get<1>(i)[0] = pl.template get<0>(i)[0]+100.0;
        pl_prp.template get<1>(i)[1] = pl.template get<0>(i)[1]+100.0;
        pl_prp.template get<1>(i)[2] = pl.template get<0>(i)[2]+100.0;

        pl_prp.template get<2>(i)[0][0] = pl.template get<0>(i)[0]+1000.0;
        pl_prp.template get<2>(i)[0][1] = pl.template get<0>(i)[1]+1000.0;
        pl_prp.template get<2>(i)[0][2] = pl.template get<0>(i)[2]+1000.0;

        pl_prp.template get<2>(i)[1][0] = pl.template get<0>(i)[0]+2000.0;
        pl_prp.template get<2>(i)[1][1] = pl.template get<0>(i)[1]+3000.0;
        pl_prp.template get<2>(i)[1][2] = pl.template get<0>(i)[2]+4000.0;

        pl_prp.template get<2>(i)[2][0] = pl.template get<0>(i)[0]+5000.0;
        pl_prp.template get<2>(i)[2][1] = pl.template get<0>(i)[1]+6000.0;
        pl_prp.template get<2>(i)[2][2] = pl.template get<0>(i)[2]+7000.0;
    }
    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1,2>();
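
    // construct() builds the cell list on the device and, at the same time,
    // writes the cell-sorted copies of positions and properties into pl_out
    // and pl_prp_out.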
    mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context);
    pl_prp_out.deviceToHost<0>();
    pl_prp_out.deviceToHost<1>();
    pl_prp_out.deviceToHost<2>();
    for (size_t i = 0 ; i < pl_correct.size() ; i++)
    {
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<0>(i),(float)pl_correct.template get<0>(i)[0]);
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[0],(float)(pl_correct.template get<0>(i)[0]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[1],(float)(pl_correct.template get<0>(i)[1]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[2],(float)(pl_correct.template get<0>(i)[2]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][0],(float)(pl_correct.template get<0>(i)[0] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][1],(float)(pl_correct.template get<0>(i)[1] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][2],(float)(pl_correct.template get<0>(i)[2] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][0],(float)(pl_correct.template get<0>(i)[0] + 2000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][1],(float)(pl_correct.template get<0>(i)[1] + 3000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][2],(float)(pl_correct.template get<0>(i)[2] + 4000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][0],(float)(pl_correct.template get<0>(i)[0] + 5000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][1],(float)(pl_correct.template get<0>(i)[1] + 6000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][2],(float)(pl_correct.template get<0>(i)[2] + 7000.0));
    }
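
    // The two permutation maps must be mutually inverse: vsrt maps a sorted
    // index to the original particle id, vnsrt maps the original id back.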
    auto & vsrt = cl2.getSortToNonSort();
    vsrt.template deviceToHost<0>();

    BOOST_REQUIRE_EQUAL(vsrt.size(),9);

    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);

    auto & vnsrt = cl2.getNonSortToSort();

    BOOST_REQUIRE_EQUAL(vnsrt.size(),9);

    vnsrt.template deviceToHost<0>();

    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
{
    std::cout << "Test cell list GPU sparse" << "\n";

    Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,unsigned int,int,true>>(box);

    std::cout << "End cell list GPU sparse" << "\n";
}
template<unsigned int dim, typename vector_ps, typename vector_pr>
void fill_random_parts(Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp, size_t n)
{
    for (size_t i = 0 ; i < n ; i++)

    vd_prp.last().template get<0>() = i % 3;
}
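
// The calc_force_number* kernels count the neighbors of every particle and
// the calc_force_list* kernels write the neighbor ids into a flattened list
// (offsets come from a scan of the counts). The *_noato variants accumulate
// into a per-particle slot instead of using atomicAdd; the _box/_rad variants
// differ only in the neighborhood iterator they use.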
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    auto it = cl.getNNIterator(cl.getCell(xp));

    auto q = it.get_sort();
    auto q_ns = it.get();

    int s1 = s_t_ns.template get<0>(q);

    atomicAdd(&vn.template get<0>(s1), 1);
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    auto it = cl.getNNIterator(cl.getCell(xp));

    auto q = it.get_sort();
    auto q_ns = it.get();

    int s1 = s_t_ns.template get<0>(q);

    ++vn.template get<0>(p);
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x + start;

    if (p >= pos.size()) return;

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    atomicAdd(&vn.template get<0>(s1), 1);
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_box_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x + start;

    if (p >= pos.size()) return;

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    auto q = it.get_sort();

    ++vn.template get<0>(p);
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    auto it = cl.getNNIteratorRadius(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    atomicAdd(&vn.template get<0>(s1), 1);
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    int start_list = v_nscan.template get<0>(p);

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    v_list.template get<0>(start_list) = s1;
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    int start_list = v_nscan.template get<0>(p);

    auto it = cl.getNNIterator(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    v_list.template get<0>(start_list) = s1;
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_box_partial(vector_pos pos,

                                            vector_n_type v_nscan,
                                            vector_n_type v_nscan_part,
                                            vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    int start_list = v_nscan.template get<0>(p) + v_nscan_part.template get<0>(p);

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    v_list.template get<0>(start_list) = s1;
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size()) return;

    int start_list = v_nscan.template get<0>(p);

    auto it = cl.getNNIteratorRadius(cl.getCell(xp));

    auto q = it.get_sort();

    int s1 = s_t_ns.template get<0>(q);

    v_list.template get<0>(start_list) = s1;
}
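
// Implementation selectors for the force tests below: impl 0 uses the plain
// cell-neighborhood iterator, impl 1 the radius iterator, impl 2 the box
// (setBoxNN) iterator. Each specialization wires the matching kernel and the
// matching host-side getNN() used to build the reference lists.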
template<unsigned int impl>

    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_number),ite,pl.toKernel(),s_t_ns.toKernel(),
                    cl2.toKernel(),n_out.toKernel());
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_list),ite,pl.toKernel(),

                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }

    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIterator(cell))
    {
        return nn.getNNIterator(cell);
    }
    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
    {
        cl2.setRadius(radius);
        cl_cpu.setRadius(radius);
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
                                           decltype(s_t_ns.toKernel()),
                                           decltype(cl2.toKernel()),
                                           decltype(n_out.toKernel())>),

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
                                         decltype(s_t_ns.toKernel()),
                                         decltype(cl2.toKernel()),
                                         decltype(nn_list.toKernel())>),

                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }

    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
    {
        return nn.getNNIteratorRadius(cell);
    }
    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
    {
        cl2.setRadius(radius);
        cl_cpu.setRadius(radius);
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num_noato(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
                                                 decltype(s_t_ns.toKernel()),
                                                 decltype(cl2.toKernel()),
                                                 decltype(n_out.toKernel())>),

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
                                           decltype(s_t_ns.toKernel()),
                                           decltype(cl2.toKernel()),
                                           decltype(n_out.toKernel())>),

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
                                         decltype(s_t_ns.toKernel()),
                                         decltype(cl2.toKernel()),
                                         decltype(nn_list.toKernel())>),

                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list_partial(pl_type & pl,
                                  s_t_ns_type & s_t_ns,
                                  n_out_scan_type & n_out_scan,
                                  n_out_scan_type & n_out_scan_partial,
                                  nn_list_type & nn_list)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_box_partial),ite,pl.toKernel(),

                    n_out_scan.toKernel(),
                    n_out_scan_partial.toKernel(),
                    nn_list.toKernel());
    }

    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
    {
        return nn.getNNIteratorRadius(cell);
    }
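
// Generic force-style test: build both a CPU and a GPU cell list over the
// same random particles, count and collect neighbors on the device with the
// impl-selected kernels, and compare against the host iteration.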
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim], int box_nn = 1)
{
    CellS cl2(box,div,2);

    cl2.setBoxNN(box_nn);

    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());
    n_out.resize(pl.size()+1);

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();
    auto it2 = pl.getIterator();

    while (it2.isNext())

    size_t g_m = pl.size() / 2;

    mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);

    auto & s_t_ns = cl2.getSortToNonSort();

    pl.template hostToDevice<0>();

    auto & gdsi = cl2.getDomainSortIds();
    gdsi.template deviceToHost<0>();
    s_t_ns.template deviceToHost<0>();

    for (size_t i = 0 ; i < g_m ; i++)
    {
        unsigned int p = gdsi.template get<0>(i);

        match &= (s_t_ns.template get<0>(p) < g_m);
    }

    BOOST_REQUIRE_EQUAL(match,true);
    n_out.deviceToHost<0>();

    auto it = pl.getIterator();

    while (NN_it.isNext())

    auto q = NN_it.get();

    check &= n_ele == n_out.template get<0>(p);

    std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;

    BOOST_REQUIRE_EQUAL(check,true);
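
    // Scan the neighbor counts into offsets, size the flattened neighbor
    // list from the last scan entry, and fill it on the device.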
    n_out_scan.resize(pl.size()+1);

    openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
    n_out_scan.template deviceToHost<0>();

    if (n_out_scan.template get<0>(pl.size()) == 0)

    nn_list.resize(n_out_scan.template get<0>(pl.size()));

    pl.template hostToDevice<0>();

    nn_list.template deviceToHost<0>();

    n_out.deviceToHost<0>();

    auto it = pl.getIterator();

    while (NN_it.isNext())

    auto q = NN_it.get();

    for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
    {
        gpu_list.add(nn_list.template get<0>(i));
    }

    for (size_t j = 0 ; j < cpu_list.size() ; j++)
    {check &= cpu_list.get(j) == gpu_list.get(j);}

    BOOST_REQUIRE_EQUAL(check,true);
}
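
// Same as Test_cell_gpu_force, but the cell list is built in two halves
// (two construct() calls over disjoint particle ranges) and the neighbor
// counts of the two passes are combined through a partial scan.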
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim], int box_nn = 1)
{
    CellS cl2_split1(box,div,2);
    CellS cl2_split2(box,div,2);

    cl2_split1.setBoxNN(box_nn);
    cl2_split2.setBoxNN(box_nn);

    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());
    n_out.resize(pl.size()+1);

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();
    auto it2 = pl.getIterator();

    while (it2.isNext())

    size_t g_m = pl.size() / 2;

    mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
    cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
    cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());

    auto & s_t_ns_s1 = cl2_split1.getSortToNonSort();
    auto & s_t_ns_s2 = cl2_split2.getSortToNonSort();

    n_out_partial = n_out;

    auto & gdsi_s1 = cl2_split1.getDomainSortIds();
    gdsi_s1.template deviceToHost<0>();
    s_t_ns_s1.template deviceToHost<0>();

    for (size_t i = 0 ; i < g_m ; i++)
    {
        unsigned int p = gdsi_s1.template get<0>(i);

        match &= (s_t_ns_s1.template get<0>(p) < g_m);
    }

    BOOST_REQUIRE_EQUAL(match,true);
    n_out.deviceToHost<0>();

    auto it = pl.getIteratorTo(pl.size()/2-1);

    while (NN_it.isNext())

    auto q = NN_it.get();

    check &= n_ele == n_out.template get<0>(p);

    std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;

    BOOST_REQUIRE_EQUAL(check,true);
    n_out_scan.resize(n_out.size());

    openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);

    n_out_scan.template deviceToHost<0>();

    if (n_out_scan.template get<0>(pl.size()) == 0)

    nn_list.resize(n_out_scan.template get<0>(pl.size()));

    pl.template hostToDevice<0>();

    nn_list.template deviceToHost<0>();

    n_out.deviceToHost<0>();

    auto it = pl.getIteratorTo(pl.size()/2-1);

    while (NN_it.isNext())

    auto q = NN_it.get();

    for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
    {
        gpu_list.add(nn_list.template get<0>(i));
    }

#ifndef MAKE_CELLLIST_DETERMINISTIC

    for (size_t j = 0 ; j < cpu_list.size() ; j++)
    {check &= cpu_list.get(j) == gpu_list.get(j);}

    std::cout << "NPARTS: " << npart << std::endl;

    for (size_t j = 0 ; j < cpu_list.size() ; j++)
    {std::cout << cpu_list.get(j) << " " << gpu_list.get(j) << std::endl;}

#endif

    BOOST_REQUIRE_EQUAL(check,true);
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split )
{
    std::cout << "Test cell list GPU split" << "\n";

    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});

    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU split" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,1000,{32,32,32},2);
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,10000,{32,32,32},2);

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,1000,{32,32,32},2);
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,10000,{32,32,32},2);

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,10000,{32,32,32});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,10000,{16,16,16});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,10000,{16,16,16});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse )
{
    std::cout << "Test cell list GPU force sparse" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,10000,{16,16,16});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,10000,{16,16,16});

    std::cout << "End cell list GPU force sparse" << "\n";
}
template<typename CellList_type, typename Vector_type, typename Vector_out>
__global__ void cl_offload_gpu(CellList_type cl, Vector_type parts, Vector_out output)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    output.template get<0>(p) = cl.getNelements(cl.getCell(xp));
}
template<typename CellList_type, typename Vector_type, typename Vector_scan_type, typename Vector_list_type>
__global__ void cl_offload_gpu_list(CellList_type cl, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size()) return;

    int id = cl.getCell(xp);
    int n_ele = cl.getNelements(id);
    int start = scan.template get<0>(p);

    for (int j = 0 ; j < n_ele ; j++)
    {
        list.template get<0>(start+j) = cl.get(id,j);
    }
}
BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
{
    std::cout << "Test cell list offload gpu" << "\n";

    size_t div[3] = {10,10,10};

    os.resize(v.size());

    for (size_t i = 0 ; i < v.size() ; i++)
    {
        v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
        v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
        v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;
    }

    auto ite = v.getGPUIterator();

    v.hostToDevice<0>();

    CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());

    os.deviceToHost<0>();

    for (size_t i = 0 ; i < os.size() ; i++)
    {
        match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
    }

    BOOST_REQUIRE_EQUAL(match,true);
    os_scan.resize(v.size());

    mgpu::ofp_context_t ctx;
    openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx);

    os_scan.deviceToHost<0>();
    os.deviceToHost<0>(os.size()-1,os.size()-1);
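
    // openfpm::scan is exclusive, so the total list length is the last
    // offset plus the last per-cell count.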
    size_t size_list = os_scan.template get<0>(os_scan.size()-1) + os.template get<0>(os.size()-1);

    os_list.resize(size_list);

    CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
                      decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
                     cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());

    os_list.deviceToHost<0>();

    for (size_t i = 0 ; i < os.size() ; i++)
    {
        for (size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
        {
            match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
        }
    }

    BOOST_REQUIRE_EQUAL(match,true);

    std::cout << "End cell list offload gpu" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_swap_test )
{
    size_t npart = 4096;

    size_t div[3] = {10,10,10};

    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl2(box,div,2);
    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl3(box,div,2);
    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl4(box,div,2);

    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();

    size_t g_m = pl.size() / 2;

    mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
    cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
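
    // cl3 is expected to have received cl2's state through the swap, so it
    // must match cl4 (built identically) cell by cell.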
    cl3.debug_deviceToHost();
    cl4.debug_deviceToHost();

    BOOST_REQUIRE_EQUAL(cl3.getNCells(),cl4.getNCells());

    for (size_t i = 0 ; i < cl3.getNCells() ; i++)
    {
        check &= cl3.getNelements(i) == cl4.getNelements(i);

        for (size_t j = 0 ; j < cl3.getNelements(i) ; j++)
        {
            s1.add(cl3.get(i,j));
            s2.add(cl4.get(i,j));
        }

        for (size_t j = 0 ; j < s1.size() ; j++)
        {
            check &= s1.get(j) == s2.get(j);
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
}
BOOST_AUTO_TEST_SUITE_END()