#define BOOST_GPU_ENABLED __host__ __device__
#include "util/cuda_launch.hpp"

#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>

#include "util/cuda_util.hpp"
#include "cuda/CellList_gpu.hpp"
#include "CellList.hpp"
#include "util/boost/boost_array_openfpm.hpp"
#include "Point_test.hpp"
BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
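/*! \brief Check the subindex kernel with the no_transform_only transform:
 * for every particle it must write the linearized cell id into part_ids
 * and count the particles falling in each cell into cl_n.
 */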
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_sub_index()
    for (size_t i = 0 ; i < dim ; i++)
    { /* ... */ }

    cl_n.resize(17*17*17);
    cl_n.template fill<0>(0);

    part_ids.resize(pl.size());

    size_t sz[3] = {17,17,17};

    auto ite = pl.getGPUIterator();

    pl.template hostToDevice<0>();
    CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
                     /* ... */
                     part_ids.toKernel());
    cl_n.template deviceToHost<0>();
    part_ids.template deviceToHost<0>();
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));

    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
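/*! \brief Same check as test_sub_index, but with the shift_only transform. */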
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_sub_index2()
    for (size_t i = 0 ; i < dim ; i++)
    { /* ... */ }

    cl_n.resize(17*17*17);
    cl_n.template fill<0>(0);

    part_ids.resize(pl.size());

    size_t sz[3] = {17,17,17};

    auto ite = pl.getGPUIterator();

    pl.template hostToDevice<0>();
    CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
                     /* ... */
                     part_ids.toKernel());
    cl_n.template deviceToHost<0>();
    part_ids.template deviceToHost<0>();
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
    BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));

    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
    BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
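/*! \brief Fill pl with n_part random positions in [0,1)^3 and insert every
 * particle into the (CPU) cell list cl.
 */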
template<unsigned int dim, typename T>
void create_n_part(int n_part, /* ... */)

    auto it = pl.getIterator();

        pl.template get<0>(p)[0] = (double)rand()/RAND_MAX;
        pl.template get<0>(p)[1] = (double)rand()/RAND_MAX;
        pl.template get<0>(p)[2] = (double)rand()/RAND_MAX;

        xp.get(0) = pl.template get<0>(p)[0];
        xp.get(1) = pl.template get<0>(p)[1];
        xp.get(2) = pl.template get<0>(p)[2];

        size_t c = cl.getCell(xp);
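/*! \brief Build on the host the reference data for the GPU fill/reorder tests:
 * starts   - for every cell, the running offset (scan) of the particle counts
 * part_ids - for every particle, its cell id and its slot inside that cell
 * cells    - the particle ids laid out cell by cell
 */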
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void create_starts_and_parts_ids(/* ... */)
    part_ids.resize(n_part);
    starts.resize(n_cell);
    cells.resize(n_part);

        auto cell = itg.get();

        size_t clin = gr.LinId(cell);

        for (size_t j = 0 ; j < cl.getNelements(clin) ; j++)
        {
            size_t p_id = cl.get(clin,j);

            part_ids.template get<0>(p_id)[0] = clin;
            part_ids.template get<0>(p_id)[1] = j;

            cells.template get<0>(start+j) = p_id;
        }
        starts.template get<0>(clin) = start;
        start += cl.getNelements(clin);
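/*! \brief Compare the cell content produced on the GPU against the host
 * reference built by create_starts_and_parts_ids.
 */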
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_fill_cell()

#ifndef MAKE_CELLLIST_DETERMINISTIC
    size_t div_host[dim];

    for (size_t i = 0 ; i < dim ; i++)
    {
        domain.setLow(i,0.0);
        domain.setHigh(i,1.0);
    }

    create_n_part(5000,pl,cl);

    create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);

    cells.resize(pl.size());
    for (size_t i = 0 ; i < gr.size() - 1 ; i++)
    {
        size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);

        check &= (tot_p == cl.getNelements(i));

        for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
        {
            size_t p_id = cl.get(i,j);

            check &= part_ids.template get<0>(p_id)[0] == i;
        }
    }

    BOOST_REQUIRE(check == true);
    auto itgg = part_ids.getGPUIterator();

    starts.template hostToDevice<0>();
    part_ids.template hostToDevice<0>();

    cells.template deviceToHost<0>();
    for (size_t i = 0 ; i < gr.size() - 1 ; i++)
    {
        size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);

        check &= (tot_p == cl.getNelements(i));

        for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
        {
            size_t p_id = cl.get(i,j);

            size_t p_id2 = cells.template get<0>(starts.template get<0>(i) + j);

            check &= (p_id == p_id2);
        }
    }

    BOOST_REQUIRE(check == true);
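/*! \brief Device kernel that inserts a fixed set of cell keys into the sparse
 * vector; every inserted cell stores its own linearized id.
 */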
template<typename sparse_vector_type>
__global__ void construct_cells(sparse_vector_type sv, grid_sm<3,void> gs)
{
    /* ... */

    sv.template insert<0>(gs.LinId(key1)) = gs.LinId(key1);
    sv.template insert<0>(gs.LinId(key2)) = gs.LinId(key2);
    sv.template insert<0>(gs.LinId(key3)) = gs.LinId(key3);
    sv.template insert<0>(gs.LinId(key4)) = gs.LinId(key4);
    sv.template insert<0>(gs.LinId(key5)) = gs.LinId(key5);
    sv.template insert<0>(gs.LinId(key6)) = gs.LinId(key6);
    sv.template insert<0>(gs.LinId(key7)) = gs.LinId(key7);
    sv.template insert<0>(gs.LinId(key8)) = gs.LinId(key8);
    sv.template insert<0>(gs.LinId(key9)) = gs.LinId(key9);
    sv.template insert<0>(gs.LinId(key10)) = gs.LinId(key10);

    sv.flush_block_insert();
}
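/*! \brief Count, for every non-empty cell, how many of its neighborhood cells
 * are non-empty (count_nn_cells), scan the counts, fill the neighbor-cell list
 * (fill_nn_cells) and check both against the expected linearized ids.
 */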
void test_cell_count_n()

    vs.template setBackground<0>(-1);

    size_t sz[] = {17,17,17};

    CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);

    int mid = gs.LinId(middle);

        cells_nn_test.get<0>(cells_nn_test.size()-1) = (int)gs.LinId(p) - mid;

    cells_nn_test.template hostToDevice<0>();

    auto itgg = vs.getGPUIterator();
    CUDA_LAUNCH((count_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());

    cells_nn.deviceToHost<0>();
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
    BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
    openfpm::scan((unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (unsigned int *)cells_nn.template getDeviceBuffer<0>(), ctx);

    cell_nn_list.resize(7*8 + 9 + 2 + 1);

    CUDA_LAUNCH((fill_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);

    cell_nn_list.deviceToHost<0>();
    for (size_t i = 0 ; i < 7 ; i++)
    {
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
        BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
    }

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);

    BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
BOOST_AUTO_TEST_CASE( test_count_nn_cells )
{
    std::cout << "Test cell count nn" << std::endl;

    test_cell_count_n();
}
BOOST_AUTO_TEST_CASE( test_subindex_funcs )
{
    std::cout << "Test cell list GPU base func" << "\n";

    test_sub_index<3,float,int,unsigned char>();
    test_sub_index2<3,float,int,unsigned char>();

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE ( test_cell_fill )
{
    std::cout << "Test GPU fill cells" << "\n";

    test_fill_cell<3,float,unsigned int, unsigned char>();

    std::cout << "End GPU fill cells" << "\n";
}
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_reorder_parts(size_t n_part)

    size_t div_host[dim];

    for (size_t i = 0 ; i < dim ; i++)
    {
        domain.setLow(i,0.0);
        domain.setHigh(i,1.0);
    }

    create_n_part(n_part,pl,cl);
    parts_prp.resize(n_part);
    parts_prp_out.resize(n_part);
    pl_out.resize(n_part);
    sort_to_not_sort.resize(n_part);
    non_sort_to_sort.resize(n_part);
    auto p_it = parts_prp.getIterator();
    while (p_it.isNext())
    {
        auto p = p_it.get();

        parts_prp.template get<0>(p) = 10000 + p;
        parts_prp.template get<1>(p) = 20000 + p;

        parts_prp.template get<2>(p)[0] = 30000 + p;
        parts_prp.template get<2>(p)[1] = 40000 + p;
        parts_prp.template get<2>(p)[2] = 50000 + p;

        parts_prp.template get<3>(p)[0][0] = 60000 + p;
        parts_prp.template get<3>(p)[0][1] = 70000 + p;
        parts_prp.template get<3>(p)[0][2] = 80000 + p;
        parts_prp.template get<3>(p)[1][0] = 90000 + p;
        parts_prp.template get<3>(p)[1][1] = 100000 + p;
        parts_prp.template get<3>(p)[1][2] = 110000 + p;
        parts_prp.template get<3>(p)[2][0] = 120000 + p;
        parts_prp.template get<3>(p)[2][1] = 130000 + p;
        parts_prp.template get<3>(p)[2][2] = 140000 + p;

        ++p_it;
    }
    create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);

    auto itgg = pl.getGPUIterator();

    cells_out.template hostToDevice<0>();

    auto ite = pl.getGPUIterator();

    parts_prp.template hostToDevice<0,1,2,3>();
    CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
                     decltype(pl.toKernel()),
                     decltype(sort_to_not_sort.toKernel()),
                     decltype(cells_out.toKernel()),
                     /* ... */
                     parts_prp.toKernel(),
                     parts_prp_out.toKernel(),
                     /* ... */
                     sort_to_not_sort.toKernel(),
                     non_sort_to_sort.toKernel(),
                     cells_out.toKernel());
    parts_prp_out.template deviceToHost<0>();
    sort_to_not_sort.template deviceToHost<0>();
    non_sort_to_sort.template deviceToHost<0>();
    for (size_t i = 0 ; i < tot ; i++)
    {
        size_t n = cl.getNelements(i);

        for (size_t j = 0 ; j < n ; j++)
        {
            size_t p = cl.get(i,j);

            check &= parts_prp_out.template get<0>(st) == parts_prp.template get<0>(p);
            check &= sort_to_not_sort.template get<0>(st) == p;
            check &= non_sort_to_sort.template get<0>(p) == st;
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
BOOST_AUTO_TEST_CASE ( test_reorder_particles )
{
    std::cout << "Test GPU reorder" << "\n";

    test_reorder_parts<3,float,unsigned int, unsigned char>(5000);

    std::cout << "End GPU reorder" << "\n";
}
template<unsigned int dim, typename T, typename CellS>
void Test_cell_gpu(SpaceBox<dim,T> & box)

    size_t div[dim] = {16,16,16};

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());

    for (size_t i = 0 ; i < pl.size() ; i++)
    {
        pl_prp.template get<0>(i) = pl.template get<0>(i)[0];

        pl_prp.template get<1>(i)[0] = pl.template get<0>(i)[0]+100.0;
        pl_prp.template get<1>(i)[1] = pl.template get<0>(i)[1]+100.0;
        pl_prp.template get<1>(i)[2] = pl.template get<0>(i)[2]+100.0;

        pl_prp.template get<2>(i)[0][0] = pl.template get<0>(i)[0]+1000.0;
        pl_prp.template get<2>(i)[0][1] = pl.template get<0>(i)[1]+1000.0;
        pl_prp.template get<2>(i)[0][2] = pl.template get<0>(i)[2]+1000.0;

        pl_prp.template get<2>(i)[1][0] = pl.template get<0>(i)[0]+2000.0;
        pl_prp.template get<2>(i)[1][1] = pl.template get<0>(i)[1]+3000.0;
        pl_prp.template get<2>(i)[1][2] = pl.template get<0>(i)[2]+4000.0;

        pl_prp.template get<2>(i)[2][0] = pl.template get<0>(i)[0]+5000.0;
        pl_prp.template get<2>(i)[2][1] = pl.template get<0>(i)[1]+6000.0;
        pl_prp.template get<2>(i)[2][2] = pl.template get<0>(i)[2]+7000.0;
    }
    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1,2>();

    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context);

    pl_prp_out.deviceToHost<0>();
    pl_prp_out.deviceToHost<1>();
    pl_prp_out.deviceToHost<2>();
    for (size_t i = 0 ; i < pl_correct.size() ; i++)
    {
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<0>(i),(float)pl_correct.template get<0>(i)[0]);
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[0],(float)(pl_correct.template get<0>(i)[0]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[1],(float)(pl_correct.template get<0>(i)[1]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[2],(float)(pl_correct.template get<0>(i)[2]+100.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][0],(float)(pl_correct.template get<0>(i)[0] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][1],(float)(pl_correct.template get<0>(i)[1] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][2],(float)(pl_correct.template get<0>(i)[2] + 1000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][0],(float)(pl_correct.template get<0>(i)[0] + 2000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][1],(float)(pl_correct.template get<0>(i)[1] + 3000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][2],(float)(pl_correct.template get<0>(i)[2] + 4000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][0],(float)(pl_correct.template get<0>(i)[0] + 5000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][1],(float)(pl_correct.template get<0>(i)[1] + 6000.0));
        BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][2],(float)(pl_correct.template get<0>(i)[2] + 7000.0));
    }
    auto & vsrt = cl2.getSortToNonSort();
    vsrt.template deviceToHost<0>();

    BOOST_REQUIRE_EQUAL(vsrt.size(),9);

    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
    BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);

    auto & vnsrt = cl2.getNonSortToSort();

    BOOST_REQUIRE_EQUAL(vnsrt.size(),9);

    vnsrt.template deviceToHost<0>();

    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
    BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
BOOST_AUTO_TEST_CASE( CellList_gpu_use )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
{
    std::cout << "Test cell list GPU sparse" << "\n";

    Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,unsigned int,int,true>>(box);

    std::cout << "End cell list GPU sparse" << "\n";
}
template<unsigned int dim, typename vector_ps, typename vector_pr>
void fill_random_parts(Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp, size_t n)

    for (size_t i = 0 ; i < n ; i++)
    {
        /* ... */

        vd_prp.last().template get<0>() = i % 3;
    }
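/*! \brief Per-particle neighborhood kernels used by the force tests: one
 * thread per particle walks the cell neighborhood (plain, box or radius
 * iterator) and counts the neighbors, either with an atomicAdd into the
 * non-sorted slot or, in the _noato variants, directly into its own slot.
 */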
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    /* ... */

    auto it = cl.getNNIterator(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();
        auto q_ns = it.get();

        int s1 = s_t_ns.template get<0>(q);

        atomicAdd(&vn.template get<0>(s1), 1);

        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    /* ... */

    auto it = cl.getNNIterator(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();
        auto q_ns = it.get();

        int s1 = s_t_ns.template get<0>(q);

        ++vn.template get<0>(p);

        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x + start;

    if (p >= pos.size())
    {return;}

    /* ... */

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        atomicAdd(&vn.template get<0>(s1), 1);

        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_box_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x + start;

    if (p >= pos.size())
    {return;}

    /* ... */

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        ++vn.template get<0>(p);

        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_number_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    /* ... */

    auto it = cl.getNNIteratorRadius(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        atomicAdd(&vn.template get<0>(s1), 1);

        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    int start_list = v_nscan.template get<0>(p);

    /* ... */

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        v_list.template get<0>(start_list) = s1;

        ++start_list;
        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    int start_list = v_nscan.template get<0>(p);

    /* ... */

    auto it = cl.getNNIterator(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        v_list.template get<0>(start_list) = s1;

        ++start_list;
        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_box_partial(vector_pos pos,
                                            vector_ns s_t_ns,
                                            CellList_type cl,
                                            vector_n_type v_nscan,
                                            vector_n_type v_nscan_part,
                                            vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    int start_list = v_nscan.template get<0>(p) + v_nscan_part.template get<0>(p);

    /* ... */

    auto it = cl.getNNIteratorBox(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        v_list.template get<0>(start_list) = s1;

        ++start_list;
        ++it;
    }
}
template<typename vector_pos, typename vector_ns, typename CellList_type, typename vector_n_type>
__global__ void calc_force_list_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan, vector_n_type v_list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= pos.size())
    {return;}

    int start_list = v_nscan.template get<0>(p);

    /* ... */

    auto it = cl.getNNIteratorRadius(cl.getCell(xp));

    while (it.isNext())
    {
        auto q = it.get_sort();

        int s1 = s_t_ns.template get<0>(q);

        v_list.template get<0>(start_list) = s1;

        ++start_list;
        ++it;
    }
}
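/*! \brief Selector for the neighborhood strategy exercised by the force tests:
 * impl 0 dispatches to the plain cell NN iterator, impl 1 to the radius
 * iterator, impl 2 to the box iterator.
 */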
template<unsigned int impl>

    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
    {}

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_number),ite,pl.toKernel(),s_t_ns.toKernel(),
                    cl2.toKernel(),n_out.toKernel());
    }
    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_list),ite,pl.toKernel(),
                    s_t_ns.toKernel(),
                    cl2.toKernel(),
                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }
    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIterator(cell))
    {
        return nn.getNNIterator(cell);
    }
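// impl == 1: radius-based neighborhood iteration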
    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
    {
        cl2.setRadius(radius);
        cl_cpu.setRadius(radius);
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
                                           decltype(s_t_ns.toKernel()),
                                           decltype(cl2.toKernel()),
                                           decltype(n_out.toKernel())>),
                    ite,pl.toKernel(),s_t_ns.toKernel(),cl2.toKernel(),n_out.toKernel());
    }
    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = pl.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
                                         decltype(s_t_ns.toKernel()),
                                         decltype(cl2.toKernel()),
                                         decltype(nn_list.toKernel())>),
                    ite,pl.toKernel(),s_t_ns.toKernel(),cl2.toKernel(),
                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }
    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
    {
        return nn.getNNIteratorRadius(cell);
    }
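// impl == 2: box-based neighborhood iteration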
    template<typename CellS, typename Cells_cpu_type, typename T>
    static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
    {
        cl2.setRadius(radius);
        cl_cpu.setRadius(radius);
    }

    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num_noato(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
                                                 decltype(s_t_ns.toKernel()),
                                                 decltype(cl2.toKernel()),
                                                 decltype(n_out.toKernel())>),
                    ite,pl.toKernel(),s_t_ns.toKernel(),cl2.toKernel(),n_out.toKernel(),start);
    }
    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
    static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
                                           decltype(s_t_ns.toKernel()),
                                           decltype(cl2.toKernel()),
                                           decltype(n_out.toKernel())>),
                    ite,pl.toKernel(),s_t_ns.toKernel(),cl2.toKernel(),n_out.toKernel(),start);
    }
    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
                                         decltype(s_t_ns.toKernel()),
                                         decltype(cl2.toKernel()),
                                         decltype(nn_list.toKernel())>),
                    ite,pl.toKernel(),s_t_ns.toKernel(),cl2.toKernel(),
                    n_out_scan.toKernel(),
                    nn_list.toKernel());
    }
    template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
    static void calc_list_partial(pl_type & pl,
                                  s_t_ns_type & s_t_ns,
                                  cl2_type & cl2,
                                  n_out_scan_type & n_out_scan,
                                  n_out_scan_type & n_out_scan_partial,
                                  nn_list_type & nn_list)
    {
        auto ite = s_t_ns.getGPUIterator();

        CUDA_LAUNCH((calc_force_list_box_partial),ite,pl.toKernel(),
                    s_t_ns.toKernel(),
                    cl2.toKernel(),
                    n_out_scan.toKernel(),
                    n_out_scan_partial.toKernel(),
                    nn_list.toKernel());
    }
    template<typename NN_type>
    static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
    {
        return nn.getNNIteratorRadius(cell);
    }
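/*! \brief Build the cell list on the GPU and on the CPU from the same random
 * particles, then compare per-particle neighbor counts and, after scanning the
 * counts, the full neighbor lists.
 */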
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim], int box_nn = 1)

    CellS cl2(box,div,2);

    cl2.setBoxNN(box_nn);
    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());
    n_out.resize(pl.size()+1);

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();
    auto it2 = pl.getIterator();

    while (it2.isNext())
    { /* ... */ }

    size_t g_m = pl.size() / 2;

    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);

    auto & s_t_ns = cl2.getSortToNonSort();

    pl.template hostToDevice<0>();

    auto & gdsi = cl2.getDomainSortIds();
    gdsi.template deviceToHost<0>();
    s_t_ns.template deviceToHost<0>();
    for (size_t i = 0 ; i < g_m ; i++)
    {
        unsigned int p = gdsi.template get<0>(i);

        match &= (s_t_ns.template get<0>(p) < g_m);
    }

    BOOST_REQUIRE_EQUAL(match,true);
    n_out.deviceToHost<0>();

    auto it = pl.getIterator();

        while (NN_it.isNext())
        {
            auto q = NN_it.get();
            /* ... */
        }

        check &= n_ele == n_out.template get<0>(p);

        std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;

    BOOST_REQUIRE_EQUAL(check,true);
    n_out_scan.resize(pl.size()+1);

    openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
    n_out_scan.template deviceToHost<0>();

    if (n_out_scan.template get<0>(pl.size()) == 0)
    {return;}

    nn_list.resize(n_out_scan.template get<0>(pl.size()));

    pl.template hostToDevice<0>();

    nn_list.template deviceToHost<0>();
    n_out.deviceToHost<0>();

    auto it = pl.getIterator();

        while (NN_it.isNext())
        {
            auto q = NN_it.get();
            /* ... */
        }

        for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
        {
            gpu_list.add(nn_list.template get<0>(i));
        }

        for (size_t j = 0 ; j < cpu_list.size() ; j++)
        {check &= cpu_list.get(j) == gpu_list.get(j);}

    BOOST_REQUIRE_EQUAL(check,true);
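/*! \brief Same check as Test_cell_gpu_force, but the cell list is built in two
 * halves (two construct calls over disjoint particle ranges) and only the
 * first half of the particles is verified against the CPU cell list.
 */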
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim], int box_nn = 1)

    CellS cl2_split1(box,div,2);
    CellS cl2_split2(box,div,2);

    cl2_split1.setBoxNN(box_nn);
    cl2_split2.setBoxNN(box_nn);
    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());
    n_out.resize(pl.size()+1);

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();
    auto it2 = pl.getIterator();

    while (it2.isNext())
    { /* ... */ }

    size_t g_m = pl.size() / 2;

    cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
    cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
    auto & s_t_ns_s1 = cl2_split1.getSortToNonSort();
    auto & s_t_ns_s2 = cl2_split2.getSortToNonSort();

    n_out_partial = n_out;
    auto & gdsi_s1 = cl2_split1.getDomainSortIds();
    gdsi_s1.template deviceToHost<0>();
    s_t_ns_s1.template deviceToHost<0>();

    for (size_t i = 0 ; i < g_m ; i++)
    {
        unsigned int p = gdsi_s1.template get<0>(i);

        match &= (s_t_ns_s1.template get<0>(p) < g_m);
    }

    BOOST_REQUIRE_EQUAL(match,true);
    n_out.deviceToHost<0>();

    auto it = pl.getIteratorTo(pl.size()/2-1);

        while (NN_it.isNext())
        {
            auto q = NN_it.get();
            /* ... */
        }

        check &= n_ele == n_out.template get<0>(p);

        std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;

    BOOST_REQUIRE_EQUAL(check,true);
    n_out_scan.resize(n_out.size());

    openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);

    n_out_scan.template deviceToHost<0>();

    if (n_out_scan.template get<0>(pl.size()) == 0)
    {return;}

    nn_list.resize(n_out_scan.template get<0>(pl.size()));

    pl.template hostToDevice<0>();

    nn_list.template deviceToHost<0>();
    n_out.deviceToHost<0>();

    auto it = pl.getIteratorTo(pl.size()/2-1);

        while (NN_it.isNext())
        {
            auto q = NN_it.get();
            /* ... */
        }

        for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
        {
            gpu_list.add(nn_list.template get<0>(i));
        }

#ifndef MAKE_CELLLIST_DETERMINISTIC

        for (size_t j = 0 ; j < cpu_list.size() ; j++)
        {check &= cpu_list.get(j) == gpu_list.get(j);}
        std::cout << "NPARTS: " << npart << std::endl;

        for (size_t j = 0 ; j < cpu_list.size() ; j++)
        {std::cout << cpu_list.get(j) << " " << gpu_list.get(j) << std::endl;}

    BOOST_REQUIRE_EQUAL(check,true);
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split )
{
    std::cout << "Test cell list GPU split" << "\n";

    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});

    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
    Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU split" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,1000,{32,32,32},2);
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box,10000,{32,32,32},2);

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,1000,{32,32,32},2);
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,2>(box2,10000,{32,32,32},2);

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,10000,{32,32,32});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,1000,{32,32,32});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,10000,{32,32,32});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force )
{
    std::cout << "Test cell list GPU" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,10000,{16,16,16});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,10000,{16,16,16});

    std::cout << "End cell list GPU" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse )
{
    std::cout << "Test cell list GPU force sparse" << "\n";

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box,10000,{16,16,16});

    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,1000,{16,16,16});
    Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true>,0>(box2,10000,{16,16,16});

    std::cout << "End cell list GPU force sparse" << "\n";
}
template<typename CellList_type, typename Vector_type, typename Vector_out>
__global__ void cl_offload_gpu(CellList_type cl, Vector_type parts, Vector_out output)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size())
    {return;}

    /* ... */

    output.template get<0>(p) = cl.getNelements(cl.getCell(xp));
}
template<typename CellList_type, typename Vector_type, typename Vector_scan_type, typename Vector_list_type>
__global__ void cl_offload_gpu_list(CellList_type cl, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;

    if (p >= parts.size())
    {return;}

    /* ... */

    int id = cl.getCell(xp);
    int n_ele = cl.getNelements(id);
    int start = scan.template get<0>(p);

    for (int j = 0 ; j < n_ele ; j++)
    {
        list.template get<0>(start+j) = cl.get(id,j);
    }
}
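/*! \brief Build a cell list on the CPU, offload it to the GPU and check that
 * the cell occupancy and the cell content queried inside a kernel match the
 * host-side queries.
 */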
BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
{
    std::cout << "Test cell list offload gpu" << "\n";

    size_t div[3] = {10,10,10};

    os.resize(v.size());

    for (size_t i = 0 ; i < v.size() ; i++)
    {
        v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
        v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
        v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;
    }

    auto ite = v.getGPUIterator();

    v.hostToDevice<0>();

    CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());

    os.deviceToHost<0>();

    for (size_t i = 0 ; i < os.size() ; i++)
    {
        match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
    }

    BOOST_REQUIRE_EQUAL(match,true);
    os_scan.resize(v.size());

    openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx);

    os_scan.deviceToHost<0>();
    os.deviceToHost<0>(os.size()-1,os.size()-1);
    size_t size_list = os_scan.template get<0>(os_scan.size()-1) + os.template get<0>(os.size()-1);

    os_list.resize(size_list);

    CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
                     decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
                     cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
    os_list.deviceToHost<0>();

    for (size_t i = 0 ; i < os.size() ; i++)
    {
        for (size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
        {
            match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
        }
    }

    BOOST_REQUIRE_EQUAL(match,true);

    std::cout << "End cell list offload gpu" << "\n";
}
BOOST_AUTO_TEST_CASE( CellList_swap_test )
{
    size_t npart = 4096;

    size_t div[3] = {10,10,10};

    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl2(box,div,2);
    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl3(box,div,2);
    CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl4(box,div,2);

    fill_random_parts<3>(box,pl,pl_prp,npart);

    pl_prp_out.resize(pl.size());
    pl_out.resize(pl.size());

    pl_prp.resize(pl.size());
    pl_prp_out.resize(pl.size());

    pl.template hostToDevice<0>();
    pl_prp.template hostToDevice<0,1>();

    size_t g_m = pl.size() / 2;

    cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
    cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);

    cl3.debug_deviceToHost();
    cl4.debug_deviceToHost();

    BOOST_REQUIRE_EQUAL(cl3.getNCells(),cl4.getNCells());

    for (size_t i = 0 ; i < cl3.getNCells() ; i++)
    {
        check &= cl3.getNelements(i) == cl4.getNelements(i);

        for (size_t j = 0 ; j < cl3.getNelements(i) ; j++)
        {
            s1.add(cl3.get(i,j));
            s2.add(cl4.get(i,j));
        }

        for (size_t j = 0 ; j < s1.size() ; j++)
        {
            check &= s1.get(j) == s2.get(j);
        }
    }

    BOOST_REQUIRE_EQUAL(check,true);
}
BOOST_AUTO_TEST_SUITE_END()